def crawFinanceHLDataSource(link):
    """Build one headline record from the page referenced by ``link``.

    All fetching and slicing is delegated to ``TakFinanceHLNetSpiderUtils``;
    this function only sequences those calls while narrowing a working
    fragment step by step.
    NOTE(review): the helpers are opaque from here -- their behaviour
    (fetching the page, cutting text between two markers, dropping text
    before a marker) is presumed from their names; confirm in the utils
    module.

    Returns a one-element list whose single item is
    ``[uuid1 string, link url, image url, title, publish date,
    description, 'MACRO', 'CCTVCHINA']``.
    """
    utils = TakFinanceHLNetSpiderUtils
    blockStart = '<div class="show">'

    # Narrow the input down to the region between the two block markers.
    fragment = utils.returnStartContext(link, blockStart)
    fragment = utils.filterContextByTarget(fragment, blockStart, '<div class="hot">')
    fragment = utils.removeSpecialCharacter(fragment)

    # Markers from here on are written without spaces (e.g. '<ahref="');
    # presumably removeSpecialCharacter strips whitespace -- TODO confirm.
    articleUrl = utils.filterContextByTarget(fragment, '<ahref="', '"target')

    fragment = utils.filterAfterContext(fragment, 'src')
    pictureUrl = utils.filterContextByTarget(fragment, 'class="', '"/></a>')

    fragment = utils.filterAfterContext(fragment, '<h3>')
    headline = utils.filterContextByTarget(fragment, 'blank">', '</a>')
    summary = utils.removeSpecialCharacter(
        utils.filterContextByTarget(fragment, '</h3>', '</span>')
    )

    # The publish date is the local crawl date, not a value read from the page.
    today = time.strftime("%Y-%m-%d", time.localtime())

    record = [
        str(uuid.uuid1()),
        articleUrl,
        pictureUrl,
        headline,
        today,
        summary,
        'MACRO',
        'CCTVCHINA',
    ]
    return [record]
def writeFinanceHLDataSource():
    """Crawl the NBD column page and clear the stored NBDCHINA rows.

    Calls ``crawFinanceHLDataSource`` on the fixed column URL, then deletes
    every row of ``HEADLINE_FINANCENEWS_RESOURCE_TABLE`` whose ``SOURCEFLAG``
    is ``'NBDCHINA'`` and commits. If the database raises ``conn.Error`` the
    transaction is rolled back and the error code and message are printed;
    the exception is not re-raised. The cursor is always closed on exit.

    Returns ``None``.
    """
    link = 'http://www.nbd.com.cn/columns/2'
    # NOTE(review): the crawled rows are not written anywhere in the code
    # visible here -- confirm whether an INSERT step is missing.
    currentList = crawFinanceHLDataSource(link)

    conn = TakFinanceHLNetSpiderUtils.getMySQLConn()
    cursor = conn.cursor()
    try:
        cursor.execute("DELETE FROM HEADLINE_FINANCENEWS_RESOURCE_TABLE WHERE SOURCEFLAG = 'NBDCHINA'")
        conn.commit()
    except conn.Error as e:
        # Roll back before formatting the message so that a malformed
        # ``e.args`` cannot cause the rollback to be skipped.
        conn.rollback()
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Release the cursor on both the success and the failure path.
        # NOTE(review): the connection is left open because it is not visible
        # here whether getMySQLConn() returns a shared connection -- confirm.
        cursor.close()
def crawFinanceHLDataSource(link):
    """Build one headline record from the page referenced by ``link``.

    The work is a fixed sequence of ``TakFinanceHLNetSpiderUtils`` calls
    applied to a progressively narrowed fragment.
    NOTE(review): helper behaviour (fetching, marker-based slicing) is
    presumed from the helper names; confirm in the utils module.

    Returns a one-element list whose single item is
    ``[uuid1 string, link url, image url, title, publish timestamp,
    description, "MACRO", "TAKCHINA"]``.
    """
    utils = TakFinanceHLNetSpiderUtils
    boxMarker = '<div class="p3_box">'

    # Narrow the input down to the region between the two block markers.
    fragment = utils.returnStartContext(link, boxMarker)
    fragment = utils.filterContextByTarget(fragment, boxMarker, '<div class="clear"></div>')
    fragment = utils.removeSpecialCharacter(fragment)

    # All three fields are cut from the same cleaned fragment.
    articleUrl = utils.filterContextByTarget(fragment, '<ahref="', '"target')
    headline = utils.filterContextByTarget(fragment, 'blank">', "</a></div>")
    pictureUrl = utils.filterContextByTarget(fragment, '<imgsrc="', '"border')

    # The description is cut from what follows the summary marker; the start
    # marker passed to the helper is the empty string.
    afterSummary = utils.filterAfterContext(fragment, '<divclass="summary">')
    summary = utils.filterContextByTarget(afterSummary, "", "<ahref")

    # Publish time is the local crawl time (date plus locale time via %X).
    crawledAt = time.strftime("%Y-%m-%d %X", time.localtime())

    record = [
        str(uuid.uuid1()),
        articleUrl,
        pictureUrl,
        headline,
        crawledAt,
        summary,
        "MACRO",
        "TAKCHINA",
    ]
    return [record]
def crawFinanceHLDataSource(link):
    """Build one headline record from the first list item found via ``link``.

    The work is a fixed sequence of ``TakFinanceHLNetSpiderUtils`` calls
    applied to a progressively narrowed fragment.
    NOTE(review): helper behaviour (fetching, marker-based slicing) is
    presumed from the helper names; confirm in the utils module.

    Returns a one-element list whose single item is
    ``[uuid1 string, link url, image url, title, publish timestamp,
    description, 'MACRO', 'NBDCHINA']``.
    """
    utils = TakFinanceHLNetSpiderUtils
    listMarker = '<ul class="articles unorderList unorderList-orange">'

    fragment = utils.returnStartContext(link, listMarker)
    # The list marker is applied twice, exactly as before; presumably this
    # skips ahead to the second matching <ul> -- TODO confirm.
    for _ in range(2):
        fragment = utils.filterAfterContext(fragment, listMarker)

    # Keep only one <li> ... </li> entry, then clean it.
    fragment = utils.filterContextByTarget(fragment, '<li', '</li>')
    fragment = utils.removeSpecialCharacter(fragment)

    articleUrl = utils.filterContextByTarget(fragment, '<ahref="', '"target')

    # The remaining fields are all cut from what follows the title attribute.
    fragment = utils.filterAfterContext(fragment, 'title="')
    headline = utils.filterContextByTarget(fragment, '>', '</a>')

    # Put a single space after the first ten characters of the extracted
    # stamp (presumably a YYYY-MM-DD date fused to a time -- TODO confirm).
    rawStamp = utils.filterContextByTarget(fragment, '<span>', '</span>')
    datePart, timePart = rawStamp[:10], rawStamp[10:]
    publishedAt = datePart + ' ' + timePart

    pictureUrl = utils.filterContextByTarget(fragment, 'src="', '"width')
    summary = utils.filterContextByTarget(
        fragment, '<pclass="articleMaterial_digest_3row">', '</div>'
    )

    record = [
        str(uuid.uuid1()),
        articleUrl,
        pictureUrl,
        headline,
        publishedAt,
        summary,
        'MACRO',
        'NBDCHINA',
    ]
    return [record]