def crawDailyDescriptContext(linkUrl):
    """Return the filtered description text for the page at ``linkUrl``.

    All of the work is delegated to ``QJFinanceNetSpiderUtils``:

    1. ``returnStartContext`` is called with the ``<div class="text">`` marker.
    2. ``filterContextByTarget`` is applied with ``<div class="text">`` and
       ``<p/>`` as its two marker arguments.
    3. ``filterContextByTarget`` is applied again with ``''`` and ``<``.
    4. ``removeSpecialCharacter`` is applied to the result.

    NOTE(review): what those helpers do exactly (how the page is obtained,
    how the markers are interpreted, which characters are removed) is
    defined in ``QJFinanceNetSpiderUtils`` and not visible here -- confirm
    against that module.

    :param linkUrl: value handed to ``returnStartContext``; presumably the
        URL of the article page.
    :return: the filtered text, or ``''`` when ``returnStartContext`` raises.
    """
    textMarker = '<div class="text">'
    try:
        startContext = QJFinanceNetSpiderUtils.returnStartContext(linkUrl, textMarker)
    except Exception:
        # Best effort: a page that cannot be loaded yields an empty
        # description. ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit propagating to the caller.
        return ''

    filterContext = QJFinanceNetSpiderUtils.filterContextByTarget(startContext, textMarker, '<p/>')
    filterContext = QJFinanceNetSpiderUtils.filterContextByTarget(filterContext, '', '<')
    return QJFinanceNetSpiderUtils.removeSpecialCharacter(filterContext)
def crawDailyFinanceComments(link, webNet):
    """Collect the entries dated today from the list found at ``link``.

    The list is obtained through ``QJFinanceNetSpiderUtils`` using the
    ``<div class="nl-list">`` marker and then the ``<ul>`` / ``</ul>``
    markers. Exactly five ``<li>`` ... ``</li>`` items are then taken off
    it, one per iteration, via ``divisionTarget``.

    For each item the link, title and publication date are extracted. When
    the first ten characters of the publication date equal today's local
    date (``%Y-%m-%d``), the item's description is retrieved with
    ``crawDailyDescriptContext`` and a row is appended to the result.

    NOTE(review): the semantics of the ``QJFinanceNetSpiderUtils`` helpers,
    including what ``divisionTarget`` returns when fewer than five items
    are present, are not visible here -- confirm against that module.

    :param link: value handed to ``returnStartContext``; also used as the
        prefix for item links that do not contain ``'../../../'``.
    :param webNet: prefix for item links that contain ``'../../../'``.
    :return: list of rows, each
        ``[uuid string, linkUrl, title, pubDate, description, 'FINANCE', 'QJNET']``.
    """
    utils = QJFinanceNetSpiderUtils
    parentDirMarker = '../../../'
    # Computed once so every item is compared against the same date, even
    # if the crawl happens to run across midnight.
    today = time.strftime("%Y-%m-%d", time.localtime())

    listContext = utils.returnStartContext(link, '<div class="nl-list">')
    listContext = utils.filterContextByTarget(listContext, '<ul>', '</ul>')

    rows = []
    for _ in range(5):
        division = utils.divisionTarget(listContext, '<li>', '</li>')
        listContext = division['nextContext']
        itemContext = division['targetContext']

        # The end marker 'htm' is passed to the helper and then appended
        # again, exactly as before.
        linkUrl = utils.filterContextByTarget(itemContext, '<a href="', 'htm') + 'htm'
        prefix = webNet if parentDirMarker in linkUrl else link
        linkUrl = prefix + utils.removeSpecialCharacter(linkUrl)

        title = utils.filterContextByTarget(itemContext, 'blank">', '</a>')
        pubDate = utils.filterContextByTarget(itemContext, '</a>', '</li>')

        # Only items whose date prefix matches today are kept.
        if pubDate[:10] != today:
            continue

        descriptContext = crawDailyDescriptContext(linkUrl)
        rows.append([str(uuid.uuid1()), linkUrl, title, pubDate,
                     descriptContext, 'FINANCE', 'QJNET'])
    return rows