def crawDailyStockComments(link):
    """Build one row per ``<li>`` entry found in the listing page at *link*.

    *link* is handed to ``ZZStockNetSpiderUtils.returnStartContext`` together
    with the ``<div class="column-box">`` marker, the result is narrowed to
    the span between ``<ul>`` and ``<li class="nobg">``, and that span is
    consumed one ``<li>...</li>`` chunk at a time.

    NOTE(review): this module defines ``crawDailyStockComments`` twice with
    the same logic; the later definition is the one in effect.
    NOTE(review): the original formatting of this function was lost, so the
    statements placed under the ``if linkUrl != ''`` guard were reconstructed
    -- confirm that the append is meant to be skipped for empty items.

    Args:
        link: URL of the listing page. It is also used as the prefix that is
            concatenated in front of every extracted ``.html`` path.

    Returns:
        A list of rows shaped
        ``[uuid string, linkUrl, title, pubDate, '', 'STOCK', 'ZZNET']``.
    """
    currentList = []
    startContext = ZZStockNetSpiderUtils.returnStartContext(
        link, '<div class="column-box">')
    startContext = ZZStockNetSpiderUtils.filterContextByTarget(
        startContext, '<ul>', '<li class="nobg">')
    # Named itemCount (not ``len``) so the builtin stays usable in this scope.
    itemCount = ZZStockNetSpiderUtils.findAllTarget(startContext, '<li>')

    # Loop-invariant, so computed once. time.strftime() with no time tuple
    # formats the current local time, matching the original call chain.
    # NOTE(review): presumably the ctime span carries no year; entries from a
    # previous year would then be stamped with the current one -- confirm.
    currentYear = time.strftime('%Y') + '-'

    for _ in range(itemCount):
        targetContext = ZZStockNetSpiderUtils.divisionTarget(
            startContext, '<li>', '</li>')
        # Continue from the remainder on the next iteration.
        startContext = targetContext['nextContext']
        currentContext = targetContext['targetContext']

        pubDate = currentYear + ZZStockNetSpiderUtils.filterContextByTarget(
            currentContext, '<span class="ctime">(', ')</span>')
        title = ZZStockNetSpiderUtils.filterContextByTarget(
            currentContext, '.html">', '</a>')

        linkUrl = ZZStockNetSpiderUtils.removeSpecialCharacter(currentContext)
        if linkUrl != '':
            # The markers below are matched against the cleaned text, hence
            # '<ahref="' without a space.
            linkUrl = ZZStockNetSpiderUtils.filterContextByTarget(
                linkUrl, '<ahref="', 'html">') + '.html'
            linkUrl = link + linkUrl
            currentList.append([
                str(uuid.uuid1()), linkUrl, title, pubDate, '', 'STOCK',
                'ZZNET'
            ])
    return currentList
def crawDailyStockDescriptContext(linkUrl):
    """Print and return the ``<p>``..``</p>`` extract of an article page.

    *linkUrl* is passed to ``ZZStockNetSpiderUtils.returnStartContext`` with
    the ``<div class="Dtext z_content" ...>`` marker; the result is then
    filtered between ``<p>`` and ``</p>``. Both intermediate values are
    printed, as before.

    The extract is also returned (the function used to return ``None``), so a
    caller that assigns the result -- like the commented-out call in
    ``crawDailyStockComments`` -- receives the text.

    NOTE(review): this module defines ``crawDailyStockDescriptContext`` twice
    with the same logic; the later definition is the one in effect.

    Args:
        linkUrl: URL of the article page.

    Returns:
        The value produced by ``filterContextByTarget(startContext, '<p>',
        '</p>')``.
    """
    startContext = ZZStockNetSpiderUtils.returnStartContext(
        linkUrl,
        '<div class="Dtext z_content" id="ozoom1" style="zoom: 100%;">')
    # Single-argument print(...) emits the same text under Python 2 and also
    # parses under Python 3.
    print(startContext)
    filterContext = ZZStockNetSpiderUtils.filterContextByTarget(
        startContext, '<p>', '</p>')
    print(filterContext)
    return filterContext
def crawDailyStockComments(link):
    """Build one row per ``<li>`` entry found in the listing page at *link*.

    *link* is handed to ``ZZStockNetSpiderUtils.returnStartContext`` together
    with the ``<div class="column-box">`` marker, the result is narrowed to
    the span between ``<ul>`` and ``<li class="nobg">``, and that span is
    consumed one ``<li>...</li>`` chunk at a time.

    NOTE(review): this module also contains an earlier definition of
    ``crawDailyStockComments`` with the same logic; this later one replaces it
    when the module is loaded.
    NOTE(review): the original formatting of this function was lost, so the
    statements placed under the ``if linkUrl != ''`` guard were reconstructed
    -- confirm that the append is meant to be skipped for empty items.

    Args:
        link: URL of the listing page. It is also used as the prefix that is
            concatenated in front of every extracted ``.html`` path.

    Returns:
        A list of rows shaped
        ``[uuid string, linkUrl, title, pubDate, '', 'STOCK', 'ZZNET']``.
    """
    currentList = []
    startContext = ZZStockNetSpiderUtils.returnStartContext(
        link, '<div class="column-box">')
    startContext = ZZStockNetSpiderUtils.filterContextByTarget(
        startContext, '<ul>', '<li class="nobg">')
    # Named itemCount (not ``len``) so the builtin stays usable in this scope.
    itemCount = ZZStockNetSpiderUtils.findAllTarget(startContext, '<li>')

    # Loop-invariant, so computed once. time.strftime() with no time tuple
    # formats the current local time, matching the original call chain.
    # NOTE(review): presumably the ctime span carries no year; entries from a
    # previous year would then be stamped with the current one -- confirm.
    currentYear = time.strftime('%Y') + '-'

    for _ in range(itemCount):
        targetContext = ZZStockNetSpiderUtils.divisionTarget(
            startContext, '<li>', '</li>')
        # Continue from the remainder on the next iteration.
        startContext = targetContext['nextContext']
        currentContext = targetContext['targetContext']

        pubDate = currentYear + ZZStockNetSpiderUtils.filterContextByTarget(
            currentContext, '<span class="ctime">(', ')</span>')
        title = ZZStockNetSpiderUtils.filterContextByTarget(
            currentContext, '.html">', '</a>')

        linkUrl = ZZStockNetSpiderUtils.removeSpecialCharacter(currentContext)
        if linkUrl != '':
            # The markers below are matched against the cleaned text, hence
            # '<ahref="' without a space.
            linkUrl = ZZStockNetSpiderUtils.filterContextByTarget(
                linkUrl, '<ahref="', 'html">') + '.html'
            linkUrl = link + linkUrl
            currentList.append([
                str(uuid.uuid1()), linkUrl, title, pubDate, '', 'STOCK',
                'ZZNET'
            ])
    return currentList
def crawDailyStockDescriptContext(linkUrl):
    """Print and return the ``<p>``..``</p>`` extract of an article page.

    *linkUrl* is passed to ``ZZStockNetSpiderUtils.returnStartContext`` with
    the ``<div class="Dtext z_content" ...>`` marker; the result is then
    filtered between ``<p>`` and ``</p>``. Both intermediate values are
    printed, as before.

    The extract is also returned (the function used to return ``None``), so a
    caller that assigns the result -- like the commented-out call in
    ``crawDailyStockComments`` -- receives the text.

    NOTE(review): this module also contains an earlier definition of
    ``crawDailyStockDescriptContext`` with the same logic; this later one
    replaces it when the module is loaded.

    Args:
        linkUrl: URL of the article page.

    Returns:
        The value produced by ``filterContextByTarget(startContext, '<p>',
        '</p>')``.
    """
    startContext = ZZStockNetSpiderUtils.returnStartContext(
        linkUrl,
        '<div class="Dtext z_content" id="ozoom1" style="zoom: 100%;">')
    # Single-argument print(...) emits the same text under Python 2 and also
    # parses under Python 3.
    print(startContext)
    filterContext = ZZStockNetSpiderUtils.filterContextByTarget(
        startContext, '<p>', '</p>')
    print(filterContext)
    return filterContext