def CPUStage(cls, data):
    """Inspect the images of a fetched page and report the page if too many are bad.

    Every image URL extracted from the page is downloaded and handed to
    ``cls.Parser.IsInvalidImg``. Once the number of images judged invalid
    reaches ``len(images) * float(cls.unhealthrate)``, the page is reported
    through ``master_LogCmd``, a screenshot is taken with phantomjs, and
    the scan stops.

    Args:
        data: Indexable pair; ``data[0]`` is the page content and
            ``data[1]`` is the page URL.

    Returns:
        None.
    """
    # Imported locally so this block does not depend on the file-level imports.
    import subprocess

    page_content = data[0]
    url = data[1]
    print("%s get url : %s" % (cls.Name(), url))

    # Extract the URLs of the images referenced by the page.
    imgSet = cls.GetImgUrls(url, page_content, cls.picFmt)
    # Number of bad images at which the page is reported, derived from the
    # configured bad-image ratio.
    maxUnhealImgCnt = len(imgSet) * float(cls.unhealthrate)
    unHealthImgCnt = 0

    for imgurl in imgSet:
        # Download the image; kept in its own variable so the page content
        # is not overwritten.
        imgtype, img_content = Common.GetContentByUrl(imgurl)
        if img_content is None:
            continue
        if not cls.Parser.IsInvalidImg(img_content, imgtype, imgurl):
            continue

        unHealthImgCnt += 1
        if unHealthImgCnt >= maxUnhealImgCnt:
            # Threshold reached: report the page and take a screenshot.
            picName = str(random.randint(0, 10000000)) + '.jpg'
            master_LogCmd.WriteTaskData(
                "!!!! found invalid html by %s: url: %s, picname:%s" %
                (cls.Name(), url, picName))
            # The URL comes from crawled content, so it is passed as a
            # separate argument instead of being spliced into a shell
            # command line (prevents shell injection).
            try:
                subprocess.call(['phantomjs', 'snapshot.js', url, picName])
            except OSError as exc:
                # The screenshot is best-effort: a missing phantomjs binary
                # must not abort the stage.
                print("%s snapshot failed for %s: %s" % (cls.Name(), url, exc))
            break
def IOStage(cls, url):
    """Download ``url`` and queue HTML pages for text-based analysis.

    While the download runs, an entry keyed by "<mac>_<thread id>" is kept
    in ``cls.FrameInfo`` (a redis-backed store according to the original
    comments). Afterwards the counter stored under ``cls.handleCnt`` is
    incremented and the entry is removed again.

    Args:
        url: Address of the resource to download.

    Returns:
        Always None; results are passed on through the task writers.
    """
    # Record which thread is currently fetching this URL.
    thread_id = threading.current_thread().ident
    busy_key = "%s_%d" % (cls.mac, thread_id)
    cls.FrameInfo.hset(busy_key, url)

    # Fetch the resource.
    contenttype, content = Common.GetContentByUrl(url)

    # Update the handled-page counter, then drop the in-progress entry.
    handled_before = int(cls.FrameInfo.hget(cls.handleCnt))
    cls.FrameInfo.hset(cls.handleCnt, handled_before + 1)
    cls.FrameInfo.hdel(busy_key)

    # A missing content type or a missing/empty body counts as a failed fetch.
    fetch_failed = content is None or contenttype is None or content == ''
    if fetch_failed:
        master_LogCmd.WriteTaskData("Failed : %s" % url)
        return None

    # HTML pages trigger the text-based bad-page analysis task.
    if "text/html" in contenttype:
        slaver_WordAnalysisCmd.WriteTaskData(content, url)
    return None