def writeCompanyNews():
    """Gather the company-news listing page URLs and pass them to the writer.

    The list starts with the first listing page. Six further hrefs are then
    pulled, one ``<a ...</a>`` fragment at a time, from the text that follows
    the ``<span class="current">1</span>`` marker inside the
    ``<div class="pageControl">`` block, each prefixed with the site root.
    The finished list is handed to ``writeCompanyNewsByLink``.

    NOTE(review): ``ThemeNewsSpiderUtils.returnStartContext`` presumably
    downloads the page and returns its text from the marker onwards; confirm
    against that helper.
    """
    firstPage = 'http://stock.stockstar.com/list/company.htm'
    pageLinks = [firstPage]

    pagerHtml = ThemeNewsSpiderUtils.returnStartContext(
        firstPage, '<div class="pageControl">')
    remaining = ThemeNewsSpiderUtils.filterContextByTarget(
        pagerHtml, '<span class="current">1</span>', '</a></div>')

    # Exactly six pager anchors are consumed after the current-page marker.
    for _ in range(6):
        piece = ThemeNewsSpiderUtils.divisionTarget(remaining, '<a', '</a>')
        remaining = piece['nextContext']
        anchor = piece['targetContext']
        href = ThemeNewsSpiderUtils.filterContextByTarget(
            anchor, '<a href="', '" target="_self"')
        pageLinks.append('http://stock.stockstar.com' + href)

    writeCompanyNewsByLink(pageLinks)
def writeCompanyNews():
    """Build the list of company-news listing pages and write their news.

    Starting from the first listing page, the pager block
    (``<div class="pageControl">``) is narrowed to the text after the
    ``<span class="current">1</span>`` marker. The next six anchors are read
    from it and their hrefs, prefixed with the site root, are appended to the
    list. The list is finally passed to ``writeCompanyNewsByLink``.

    NOTE(review): the exact return shapes of the ``ThemeNewsSpiderUtils``
    helpers are not visible here; ``divisionTarget`` is used as a mapping
    with ``'nextContext'`` and ``'targetContext'`` keys.
    """
    siteRoot = 'http://stock.stockstar.com'
    startUrl = 'http://stock.stockstar.com/list/company.htm'
    collected = [startUrl]

    pagerBlock = ThemeNewsSpiderUtils.returnStartContext(
        startUrl, '<div class="pageControl">')
    unparsed = ThemeNewsSpiderUtils.filterContextByTarget(
        pagerBlock, '<span class="current">1</span>', '</a></div>')

    anchorsToRead = 6
    while anchorsToRead > 0:
        split = ThemeNewsSpiderUtils.divisionTarget(unparsed, '<a', '</a>')
        unparsed = split['nextContext']
        anchorHtml = split['targetContext']
        relativeHref = ThemeNewsSpiderUtils.filterContextByTarget(
            anchorHtml, '<a href="', '" target="_self"')
        collected.append(siteRoot + relativeHref)
        anchorsToRead -= 1

    writeCompanyNewsByLink(collected)
def crawCompanyNews(link):
    """Return today's news rows scraped from one listing page.

    Args:
        link: URL of the listing page, passed to
            ``ThemeNewsSpiderUtils.returnStartContext``.

    Returns:
        A list of ``[keyid, linkUrl, pubDate, title, 'STOCKSTAR']`` rows,
        where ``keyid`` is a fresh ``uuid.uuid1()`` string. Scanning stops at
        the first item whose ``pubDate[:10]`` is not today's local date
        (``YYYY-MM-DD``); items with an empty ``linkUrl`` are skipped.

    NOTE(review): the break on the first non-today item presumably relies on
    the page listing newest items first; confirm against the site.
    """
    pageSection = ThemeNewsSpiderUtils.returnStartContext(
        link, '<div class="listnews">')
    remaining = ThemeNewsSpiderUtils.filterContextByTarget(
        pageSection, '<ul>', '</ul>')
    # Named itemCount rather than `len` so the builtin is not shadowed.
    itemCount = ThemeNewsSpiderUtils.findAllTarget(remaining, '<li')

    # Computed once so every item is compared against the same date, even if
    # the crawl runs across midnight.
    today = time.strftime("%Y-%m-%d", time.localtime())

    currentList = []
    for _ in range(itemCount):
        piece = ThemeNewsSpiderUtils.divisionTarget(remaining, '<li>', '</li>')
        remaining = piece['nextContext']
        item = piece['targetContext']

        linkUrl = ThemeNewsSpiderUtils.filterContextByTarget(
            item, '<a href="', '">')
        pubDate = ThemeNewsSpiderUtils.filterContextByTarget(
            item, '<span>', '</span>')
        title = ThemeNewsSpiderUtils.filterContextByTarget(
            item, '">', '</a>')

        if pubDate[:10] != today:
            break
        if linkUrl != '':
            # The id is generated only for rows that are actually kept.
            keyid = str(uuid.uuid1())
            currentList.append([keyid, linkUrl, pubDate, title, 'STOCKSTAR'])
    return currentList
def writeCompanyNewsByLink(currentLinkList):
    """Delete every row of ``STOCK_POOL_IMPORTANT_NEWS_TABLE``.

    Args:
        currentLinkList: list of listing-page URLs supplied by the caller.
            The visible body does not read it.

    On a database error the message is printed and the transaction is rolled
    back; the error is not re-raised.
    """
    conn = ThemeNewsSpiderUtils.getMySQLConn()
    cursor = conn.cursor()
    try:
        cursor.execute("DELETE FROM STOCK_POOL_IMPORTANT_NEWS_TABLE")
        conn.commit()
    except conn.Error as e:
        # `as` and the parenthesised print parse on both Python 2.6+ and 3;
        # the emitted text is unchanged.
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        conn.rollback()
    finally:
        # Release the cursor on every path.
        # NOTE(review): the connection is left open because it is not visible
        # here whether getMySQLConn() returns a shared connection.
        cursor.close()
def writeCompanyNewsByLink(currentLinkList):
    """Clear ``STOCK_POOL_IMPORTANT_NEWS_TABLE`` with a single DELETE.

    Args:
        currentLinkList: list of listing-page URLs supplied by the caller.
            The visible body does not read it.

    A database error is reported on stdout and the transaction is rolled
    back; nothing is re-raised to the caller.
    """
    conn = ThemeNewsSpiderUtils.getMySQLConn()
    cursor = conn.cursor()
    try:
        cursor.execute("DELETE FROM STOCK_POOL_IMPORTANT_NEWS_TABLE")
        conn.commit()
    except conn.Error as e:
        # Portable spelling: valid on Python 2.6+ and Python 3, same output.
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        conn.rollback()
    finally:
        # Always release the cursor.
        # NOTE(review): conn is deliberately not closed; whether
        # getMySQLConn() hands out a shared connection is not visible here.
        cursor.close()
def writeThemeDailyNewsByLink(currentLinkList):
    """Delete the ``STOCKNET`` rows from ``STOCK_POOL_THEME_NEWS_TABLE``.

    Args:
        currentLinkList: list of listing-page URLs supplied by the caller.
            The visible body does not read it.

    On a database error the message is printed and the transaction is rolled
    back; the error is not re-raised.
    """
    conn = ThemeNewsSpiderUtils.getMySQLConn()
    cursor = conn.cursor()
    try:
        cursor.execute("DELETE FROM STOCK_POOL_THEME_NEWS_TABLE WHERE SOURCEFLAG = 'STOCKNET'")
        conn.commit()
    except conn.Error as e:
        # `as` and the parenthesised print parse on both Python 2.6+ and 3;
        # the emitted text is unchanged.
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        conn.rollback()
    finally:
        # Release the cursor on every path.
        # NOTE(review): the connection is left open because it is not visible
        # here whether getMySQLConn() returns a shared connection.
        cursor.close()
def writeThemeDailyNewsByLink(currentLinkList):
    """Remove rows whose ``SOURCEFLAG`` is ``'STOCKNET'`` from the theme table.

    Args:
        currentLinkList: list of listing-page URLs supplied by the caller.
            The visible body does not read it.

    A database error is reported on stdout and the transaction is rolled
    back; nothing is re-raised to the caller.
    """
    conn = ThemeNewsSpiderUtils.getMySQLConn()
    cursor = conn.cursor()
    try:
        cursor.execute(
            "DELETE FROM STOCK_POOL_THEME_NEWS_TABLE WHERE SOURCEFLAG = 'STOCKNET'"
        )
        conn.commit()
    except conn.Error as e:
        # Portable spelling: valid on Python 2.6+ and Python 3, same output.
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        conn.rollback()
    finally:
        # Always release the cursor.
        # NOTE(review): conn is deliberately not closed; whether
        # getMySQLConn() hands out a shared connection is not visible here.
        cursor.close()
def crawThemeDailyNews(link):
    """Return today's theme-news rows scraped from one listing page.

    Args:
        link: URL of the listing page, passed to
            ``ThemeNewsSpiderUtils.returnStartContext``.

    Returns:
        A list of ``[keyid, linkUrl, pubDate, title, "STOCKNET"]`` rows,
        where ``keyid`` is a fresh ``uuid.uuid1()`` string. Scanning stops at
        the first item whose ``pubDate[:10]`` is not today's local date
        (``YYYY-MM-DD``); items with an empty ``linkUrl`` are skipped.

    NOTE(review): the break on the first non-today item presumably relies on
    the page listing newest items first; confirm against the site.
    """
    pageSection = ThemeNewsSpiderUtils.returnStartContext(link, '<div class="listnews">')
    remaining = ThemeNewsSpiderUtils.filterContextByTarget(pageSection, "<ul>", "</ul>")
    # Named itemCount rather than `len` so the builtin is not shadowed.
    itemCount = ThemeNewsSpiderUtils.findAllTarget(remaining, "<li")

    # Computed once so every item is compared against the same date, even if
    # the crawl runs across midnight.
    today = time.strftime("%Y-%m-%d", time.localtime())

    currentList = []
    for _ in range(itemCount):
        piece = ThemeNewsSpiderUtils.divisionTarget(remaining, "<li>", "</li>")
        remaining = piece["nextContext"]
        item = piece["targetContext"]

        linkUrl = ThemeNewsSpiderUtils.filterContextByTarget(item, '<a href="', '">')
        pubDate = ThemeNewsSpiderUtils.filterContextByTarget(item, "<span>", "</span>")
        title = ThemeNewsSpiderUtils.filterContextByTarget(item, '">', "</a>")

        if pubDate[:10] != today:
            break
        if linkUrl != "":
            # The id is generated only for rows that are actually kept.
            keyid = str(uuid.uuid1())
            currentList.append([keyid, linkUrl, pubDate, title, "STOCKNET"])
    return currentList
def crawCompanyNews(link):
    """Scrape one listing page and return the rows published today.

    Args:
        link: URL of the listing page, passed to
            ``ThemeNewsSpiderUtils.returnStartContext``.

    Returns:
        A list of ``[keyid, linkUrl, pubDate, title, 'STOCKSTAR']`` rows.
        ``keyid`` is a new ``uuid.uuid1()`` string per row. The scan ends at
        the first item whose ``pubDate[:10]`` differs from today's local date
        (``YYYY-MM-DD``); items with an empty ``linkUrl`` are left out.

    NOTE(review): stopping at the first older item presumably assumes the
    page is ordered newest first; confirm against the site.
    """
    listingHtml = ThemeNewsSpiderUtils.returnStartContext(
        link, '<div class="listnews">')
    unparsed = ThemeNewsSpiderUtils.filterContextByTarget(
        listingHtml, '<ul>', '</ul>')
    # The original bound this to `len`, hiding the builtin inside the function.
    entryCount = ThemeNewsSpiderUtils.findAllTarget(unparsed, '<li')

    # Loop-invariant: one date for the whole page keeps the filter consistent.
    todayStamp = time.strftime("%Y-%m-%d", time.localtime())

    currentList = []
    for _ in range(entryCount):
        split = ThemeNewsSpiderUtils.divisionTarget(unparsed, '<li>', '</li>')
        unparsed = split['nextContext']
        entryHtml = split['targetContext']

        linkUrl = ThemeNewsSpiderUtils.filterContextByTarget(
            entryHtml, '<a href="', '">')
        pubDate = ThemeNewsSpiderUtils.filterContextByTarget(
            entryHtml, '<span>', '</span>')
        title = ThemeNewsSpiderUtils.filterContextByTarget(
            entryHtml, '">', '</a>')

        if pubDate[:10] != todayStamp:
            break
        if linkUrl != '':
            # Only kept rows get an id.
            currentList.append(
                [str(uuid.uuid1()), linkUrl, pubDate, title, 'STOCKSTAR'])
    return currentList
def _collectTacticNews(link, listMarker, newsFlag):
    """Return the rows of one news list on the page, each tagged with newsFlag."""
    section = ThemeNewsSpiderUtils.returnStartContext(link, listMarker)
    remaining = ThemeNewsSpiderUtils.filterContextByTarget(section, '<ul>', '</ul>')
    # Named itemCount rather than `len` so the builtin is not shadowed.
    itemCount = ThemeNewsSpiderUtils.findAllTarget(remaining, '<li')

    rows = []
    for _ in range(itemCount):
        piece = ThemeNewsSpiderUtils.divisionTarget(remaining, '<li>', '</li>')
        remaining = piece['nextContext']
        item = piece['targetContext']

        linkUrl = ThemeNewsSpiderUtils.filterContextByTarget(item, '<a href="', '">')
        pubDate = ThemeNewsSpiderUtils.filterContextByTarget(item, '<span>', '</span>')
        title = ThemeNewsSpiderUtils.filterContextByTarget(item, '">', '</a>')

        if linkUrl != '':
            # The id is generated only for rows that are actually kept.
            rows.append([str(uuid.uuid1()), linkUrl, pubDate, title, newsFlag])
    return rows


def crawCompanyNews(link):
    """Return the rows of both tactic-news lists found on one page.

    Args:
        link: URL of the page, passed to
            ``ThemeNewsSpiderUtils.returnStartContext`` once per list.

    Returns:
        A list of ``[keyid, linkUrl, pubDate, title, newsFlag]`` rows. Rows
        from the ``TacticNewsList1`` block come first with ``newsFlag``
        ``'good'``, followed by rows from the ``TacticNewsList2`` block with
        ``newsFlag`` ``'bad'``. Items with an empty ``linkUrl`` are skipped.
    """
    # The two lists are parsed identically; only the marker and flag differ.
    currentList = _collectTacticNews(
        link, '<div class="listnews" id="TacticNewsList1" >', 'good')
    currentList.extend(_collectTacticNews(
        link, '<div class="listnews" id="TacticNewsList2" style="display:none;">', 'bad'))
    return currentList
def _collectTacticNews(link, listMarker, newsFlag):
    """Return the rows of one news list on the page, each tagged with newsFlag."""
    section = ThemeNewsSpiderUtils.returnStartContext(link, listMarker)
    remaining = ThemeNewsSpiderUtils.filterContextByTarget(
        section, '<ul>', '</ul>')
    # Named itemCount rather than `len` so the builtin is not shadowed.
    itemCount = ThemeNewsSpiderUtils.findAllTarget(remaining, '<li')

    rows = []
    for _ in range(itemCount):
        piece = ThemeNewsSpiderUtils.divisionTarget(
            remaining, '<li>', '</li>')
        remaining = piece['nextContext']
        item = piece['targetContext']

        linkUrl = ThemeNewsSpiderUtils.filterContextByTarget(
            item, '<a href="', '">')
        pubDate = ThemeNewsSpiderUtils.filterContextByTarget(
            item, '<span>', '</span>')
        title = ThemeNewsSpiderUtils.filterContextByTarget(
            item, '">', '</a>')

        if linkUrl != '':
            # The id is generated only for rows that are actually kept.
            keyid = str(uuid.uuid1())
            rows.append([keyid, linkUrl, pubDate, title, newsFlag])
    return rows


def crawCompanyNews(link):
    """Scrape the 'good' and 'bad' tactic-news lists from one page.

    Args:
        link: URL of the page, passed to
            ``ThemeNewsSpiderUtils.returnStartContext`` once per list.

    Returns:
        A list of ``[keyid, linkUrl, pubDate, title, newsFlag]`` rows: first
        the rows of the ``TacticNewsList1`` block flagged ``'good'``, then
        the rows of the ``TacticNewsList2`` block flagged ``'bad'``. Items
        with an empty ``linkUrl`` are skipped.
    """
    # Both lists share one parsing routine; only the marker and flag differ.
    goodRows = _collectTacticNews(
        link, '<div class="listnews" id="TacticNewsList1" >', 'good')
    badRows = _collectTacticNews(
        link,
        '<div class="listnews" id="TacticNewsList2" style="display:none;">',
        'bad')
    return goodRows + badRows