def crawSXMetalComments(link):
    """Build the comment record found in the ``tab_conbox2`` section of ``link``.

    The page content is narrowed to the ``<ul class="tab_conbox"
    id="tab_conbox2">`` marker and then to a ``<div>``...``</div>`` fragment;
    the article href, title (taken from a red ``<font>`` tag) and publication
    date are extracted from that fragment, and the article body is retrieved
    through ``crawDescriptContext``.

    Returns a one-element list whose single item is
    ``[id, url, title, pubDate, description, 'METAL', 'GTNET']`` where ``id``
    is a freshly generated ``uuid1`` string.

    NOTE(review): how the JTMetalNetSpiderUtils helpers behave when a marker
    is missing is not visible from this function -- confirm before relying
    on it.
    """
    utils = JTMetalNetSpiderUtils

    section = utils.returnStartContext(
        link, '<ul class="tab_conbox" id="tab_conbox2">')
    entry = utils.filterContextByTarget(section, '<div>', '</div>')

    # The href extracted from the entry is appended to the incoming link.
    articleUrl = link + utils.filterContextByTarget(
        entry, '<a href="', '" title')
    headline = utils.filterContextByTarget(
        entry, '<font style="color:red;" >', '</font>')
    published = utils.filterContextByTarget(entry, '<span>', '</span>')
    summary = crawDescriptContext(articleUrl)

    record = [
        str(uuid.uuid1()),
        articleUrl,
        headline,
        published,
        summary,
        'METAL',
        'GTNET',
    ]
    return [record]
def crawDailyMetalComments(link, webLink, maxItems=7):
    """Collect comment records from the ``<ul id="lie">`` listing of ``link``.

    Args:
        link: Location of the listing page, passed to
            ``JTMetalNetSpiderUtils.returnStartContext``.
        webLink: Prefix prepended to each entry's href to form the article
            URL.
        maxItems: Number of ``<li>`` entries to read from the listing.
            Defaults to 7, the count that was previously hard-coded.

    Returns:
        A list with one record per entry, each shaped
        ``[id, url, title, pubDate, description, 'METAL', 'GTNET']`` where
        ``id`` is a freshly generated ``uuid1`` string and ``description``
        comes from ``crawDescriptContext``.

    NOTE(review): this assumes the listing holds at least ``maxItems``
    entries; what ``divisionTarget`` returns once no ``<li>`` remains is not
    visible here -- confirm before raising the limit.
    """
    utils = JTMetalNetSpiderUtils

    remaining = utils.returnStartContext(link, '<ul id="lie">')
    remaining = utils.filterContextByTarget(
        remaining, '<ul id="lie">', '</ul>')

    records = []
    for _ in range(maxItems):
        # divisionTarget yields the current <li> fragment together with the
        # context left over for the next iteration.
        split = utils.divisionTarget(remaining, '<li>', '</li>')
        remaining = split['nextContext']
        item = split['targetContext']

        articleUrl = webLink + utils.filterContextByTarget(
            item, '<a href="', '" title')
        headline = utils.filterContextByTarget(item, 'title="', '">')
        published = utils.filterContextByTarget(item, '<span>', '</span>')
        summary = crawDescriptContext(articleUrl)

        records.append([
            str(uuid.uuid1()),
            articleUrl,
            headline,
            published,
            summary,
            'METAL',
            'GTNET',
        ])
    return records
def crawDescriptContext(link):
    """Extract the cleaned description paragraph from the page at ``link``.

    The content is positioned at the ``</strong></p>`` marker, the part
    after that marker is kept, a ``<p>``...``</p>`` fragment is taken from
    it, and the result is passed through
    ``JTMetalNetSpiderUtils.removeSpecialCharacter`` before being returned.

    NOTE(review): the precise behaviour of each helper (including what is
    returned when the marker is absent) is defined in JTMetalNetSpiderUtils
    and is not visible here -- confirm.
    """
    utils = JTMetalNetSpiderUtils
    marker = '</strong></p>'

    page = utils.returnStartContext(link, marker)
    afterMarker = utils.filterAfterContext(page, marker)
    paragraph = utils.filterContextByTarget(afterMarker, '<p>', '</p>')
    return utils.removeSpecialCharacter(paragraph)