def crawDailyMetalComments(link, webNet):
    """Collect the entries dated today from the list block of a listing page.

    The page at ``link`` is narrowed to the ``<ul>...</ul>`` that follows
    ``<div class="lb_you left">`` and its ``<li>`` items are walked in
    document order.  Walking stops at the first item whose date prefix is
    not today's local date.
    NOTE(review): stopping at the first mismatch presumably relies on the
    page listing the newest entries first -- confirm against the site.

    Args:
        link: Location of the listing page, passed straight to
            ``SilverMetalNetSpiderUtils.returnStartContext``.
        webNet: Prefix prepended to every extracted ``href`` value to
            build the entry URL.

    Returns:
        A list with one row per collected entry, each row being
        ``[uuid1 string, linkUrl, title, pubDate, description,
        'METAL', 'SILVERNET']``.  The list is empty when no item matches.
    """
    utils = SilverMetalNetSpiderUtils

    listContext = utils.returnStartContext(link, '<div class="lb_you left">')
    listContext = utils.filterContextByTarget(listContext, '<ul>', '</ul>')
    itemCount = utils.findAllTarget(listContext, '<li>')

    # Computed once so every item is compared against the same date, even
    # when the crawl happens to run across midnight.
    today = time.strftime("%Y-%m-%d", time.localtime())

    currentList = []
    for _ in range(itemCount):
        # divisionTarget hands back the current <li> body together with the
        # remaining text, which becomes the input of the next iteration.
        division = utils.divisionTarget(listContext, '<li>', '</li>')
        listContext = division['nextContext']
        itemContext = division['targetContext']

        linkUrl = webNet + utils.filterContextByTarget(
            itemContext, 'href="', '" target')
        title = utils.filterContextByTarget(
            itemContext, 'target=_blank>', '</a>')
        title = utils.removeSpecialCharacter(title)
        pubDate = utils.filterContextByTarget(itemContext, '</a>', '</li>')

        # Only the first ten characters of the date text are compared with
        # the YYYY-MM-DD form of today's date.
        if pubDate[0:10] != today:
            break

        descriptContext = crawDailyDescriptContext(linkUrl)
        currentList.append([
            str(uuid.uuid1()),
            linkUrl,
            title,
            pubDate,
            descriptContext,
            'METAL',
            'SILVERNET',
        ])
    return currentList
def crawDailyDescriptContext(linkUrl):
    """Extract the intro text of the article page at ``linkUrl``.

    The page is narrowed in three steps using the
    ``SilverMetalNetSpiderUtils`` helpers: start at
    ``<div class="zhaiyao">``, continue after ``<span class="intro">``,
    then take the text up to the next ``</span>``.  The result is passed
    through ``removeSpecialCharacter`` before being returned.
    NOTE(review): the exact semantics of each helper (e.g. what an empty
    start marker means to ``filterContextByTarget``) are defined outside
    this block -- confirm there.

    Args:
        linkUrl: Location of the article page, passed straight to
            ``SilverMetalNetSpiderUtils.returnStartContext``.

    Returns:
        Whatever ``removeSpecialCharacter`` yields for the extracted text.
    """
    utils = SilverMetalNetSpiderUtils

    summaryBlock = utils.returnStartContext(linkUrl, '<div class="zhaiyao">')
    afterIntroTag = utils.filterAfterContext(
        summaryBlock, '<span class="intro">')
    rawIntro = utils.filterContextByTarget(afterIntroTag, '', '</span>')
    return utils.removeSpecialCharacter(rawIntro)