def fetch(self):
    # Fetching is not possible if we do not have sources.
    if len(self.source) == 0:
        v.print_v(v.WARN, "WARNING: no sources available for package %s!" % (self.name))
        return True
    fetcher = fetch.fetcher(self.source, self)
    fetcher.fetch()
    if not fetcher.success:
        # Note: the no-sources branch is unreachable here, since we already
        # returned above when self.source was empty.
        if len(self.source) == 0:
            raise PBRecipeException(
                "Failed to fetch package '%s': no sources were provided! '%s'!" % (self.name, self.source))
        else:
            raise PBRecipeException(
                "Failed to fetch package '%s': sources were '%s'!" % (self.name, self.source))
    # Update state and source/version properties in the inventory.
    inv.set_state(self.name, "fetch")
    self.last_fetcher = fetcher
    v.print_v(v.DEBUG, "Setting fetched version info (%s,%s)" % (fetcher.used_source, fetcher.version))
    inv.set_prop(self.name, "source", fetcher.used_source)
    inv.set_prop(self.name, "version", fetcher.version)
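# For context, fetch() above (and fetched() below) only touch a small surface
# of the fetch.fetcher class: fetch(), success, used_source, version, and
# fetched(). A minimal sketch of that contract follows; the real implementation
# lives elsewhere in the repo, and _download() here is a hypothetical stub.
class fetcher(object):
    def __init__(self, sources, recipe):
        self.sources = sources
        self.recipe = recipe
        self.success = False
        self.used_source = None
        self.version = None

    def fetch(self):
        # Try each source in order; record the first one that works.
        for src in self.sources:
            try:
                self.version = self._download(src)
                self.used_source = src
                self.success = True
                return
            except IOError:
                continue

    def fetched(self):
        # True once some source has been fetched successfully.
        return self.success

    def _download(self, src):
        # Placeholder: a real implementation would download or check out
        # `src` and return the version string it found.
        raise IOError("download not implemented in this sketch")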
def single_thread(mytuple, QUEUE_HTMLNODE, DOWNLOAD_MODE):
    # Log line fields: time, node depth, HTML length, URL queue size,
    # download count, filtered count, similar-node queue size,
    # deduplicated-node queue size, current crawled URL.
    global TOTAL_COUNT
    global EXIT_FLAG
    global QUEUE_URLNODE
    stop_flag = 0
    while stop_flag < 15:
        if mytuple[0].qsize() > 0:
            stop_flag = 0
            node = mytuple[0].get()
            html = fetcher(node.url, DOWNLOAD_MODE)
            html_node = HtmlNode(node.url, html, timestamp(), node.depth)
            QUEUE_HTMLNODE.put(html_node)
            TOTAL_COUNT += 1
            if len(html) > 0:
                print timestamp() + '\t' + str(node.depth) + '\t' + str(len(html)) + '\t' + \
                    str(QUEUE_URLNODE.qsize()) + '\t' + str(TOTAL_COUNT) + '\t' + \
                    str(REFUSE_COUNT) + '\t' + str(QUEUE_SMART_NODE.qsize()) + '\t' + \
                    str(QUEUE_COMPLETE_NODE.qsize()) + '\t' + node.url
        else:
            # Queue drained: back off, and give up after 15 consecutive empty polls.
            stop_flag += 1
            time.sleep(5)
    print "EXIT_FLAG:%s" % EXIT_FLAG
    EXIT_FLAG += 1
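# The worker above is written to run in a thread pool: mytuple[0] is the URL
# queue it drains. A minimal driver sketch, assuming the globals it mutates
# (TOTAL_COUNT, EXIT_FLAG, REFUSE_COUNT, QUEUE_SMART_NODE, QUEUE_COMPLETE_NODE)
# are initialized elsewhere; start_workers and the 'static' mode are hypothetical.
import threading
import Queue  # Python 2 stdlib queue

QUEUE_URLNODE = Queue.Queue()
QUEUE_HTMLNODE = Queue.Queue()

def start_workers(seed_nodes, num_threads=4, download_mode='static'):
    # Seed the URL queue, then fan out identical workers over it.
    for node in seed_nodes:
        QUEUE_URLNODE.put(node)
    threads = []
    for _ in range(num_threads):
        t = threading.Thread(target=single_thread,
                             args=((QUEUE_URLNODE,), QUEUE_HTMLNODE, download_mode))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()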
def single_thread(mytuple, QUEUE_HTMLNODE, DOWNLOAD_MODE):
    # Variant of the worker above with a longer idle tolerance (30 empty polls
    # at 10 s each) and a log line for every fetch, even when the body is empty.
    # Log line fields: time, node depth, HTML length, URL queue size,
    # download count, filtered count, similar-node queue size,
    # deduplicated-node queue size, current crawled URL.
    global TOTAL_COUNT
    global EXIT_FLAG
    global QUEUE_URLNODE
    stop_flag = 0
    while stop_flag < 30:
        if mytuple[0].qsize() > 0:
            stop_flag = 0
            node = mytuple[0].get()
            html = fetcher(node.url, DOWNLOAD_MODE)
            html_node = HtmlNode(node.url, html, timestamp(), node.depth)
            QUEUE_HTMLNODE.put(html_node)
            TOTAL_COUNT += 1
            print timestamp() + '\t' + str(node.depth) + '\t' + str(len(html)) + '\t' + \
                str(QUEUE_URLNODE.qsize()) + '\t' + str(TOTAL_COUNT) + '\t' + \
                str(REFUSE_COUNT) + '\t' + str(QUEUE_SMART_NODE.qsize()) + '\t' + \
                str(QUEUE_COMPLETE_NODE.qsize()) + '\t' + node.url
        else:
            stop_flag += 1
            time.sleep(10)
    EXIT_FLAG += 1
def fetched(self):
    fetcher = fetch.fetcher(self.source, self)
    return fetcher.fetched()
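# Hypothetical call site for the two recipe methods above: use fetched() as a
# skip-check before the network step. `recipe` is an instance of the
# surrounding class; PBRecipeException propagates on failure.
if not recipe.fetched():
    recipe.fetch()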
def single_thread(mytuple, QUEUE_HTMLNODE, DOWNLOAD_MODE, start_urls):
    # Log line fields: time, node depth, HTML length, URL queue size,
    # download count, filtered count, similar-node queue size,
    # deduplicated-node queue size, current crawled URL.
    global TOTAL_COUNT
    global EXIT_FLAG
    global QUEUE_URLNODE
    stop_flag = 0
    while stop_flag < 15:
        if mytuple[0].qsize() > 0:
            stop_flag = 0
            node = mytuple[0].get()
            html = fetcher(node.url, DOWNLOAD_MODE)
            html_node = HtmlNode(node.url, html, timestamp(), node.depth)
            QUEUE_HTMLNODE.put(html_node)
            TOTAL_COUNT += 1
            if len(html) > 0:
                #print timestamp()+'\t'+str(node.depth)+'\t'+str(len(html))+'\t'+str(QUEUE_URLNODE.qsize())+'\t'+str(TOTAL_COUNT)+'\t'+str(REFUSE_COUNT)+'\t'+str(QUEUE_SMART_NODE.qsize())+'\t'+str(QUEUE_COMPLETE_NODE.qsize())+'\t'+node.url
                print str(node.depth) + '\t' + str(QUEUE_SMART_NODE.qsize()) + '\t' + node.url
                result = getform(node.url)
                dbconfig = {'host': '127.0.0.1', 'user': '******', 'passwd': 'mysqlroot',
                            'port': 3307, 'db': 'w3a_scan', 'charset': 'utf8'}
                db = MySQL(dbconfig)
                # Schema reference for the table written below:
                '''
                SET NAMES utf8;
                SET FOREIGN_KEY_CHECKS = 0;
                -- ----------------------------
                -- Table structure for `spider_result`
                -- ----------------------------
                DROP TABLE IF EXISTS `spider_result`;
                CREATE TABLE `spider_result` (
                  `id` int(11) NOT NULL AUTO_INCREMENT,
                  `start_url` varchar(200) NOT NULL,
                  `url` varchar(500) NOT NULL,
                  `newurl` varchar(500) NOT NULL,
                  `data` varchar(4000) NOT NULL,
                  `time` varchar(200) NOT NULL,
                  PRIMARY KEY (`id`)
                ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                SET FOREIGN_KEY_CHECKS = 1;
                '''
                if result is not None:
                    for lists in result:
                        data = base64.b64encode(json.dumps(lists["probe"]))
                        newurl = lists["probe"]['url']
                        sqlSearch = "select data from spider_result where newurl = '%s' and start_url = '%s'" % (newurl, start_urls[0])
                        #print "query statement: " + sqlSearch
                        #print "query result: " + str(db.query(sql=sqlSearch))
                        if int(db.query(sql=sqlSearch)) > 0:
                            # A row for this (newurl, start_url) already exists:
                            # compare payloads and re-insert only if they differ.
                            rows = db.fetchAllRows()
                            for row in rows:
                                stored = json.loads(base64.b64decode(row[0]))
                                cmps = cmp(lists["probe"]["payload"].encode("utf-8"), stored["payload"])
                                if int(cmps) == 0:
                                    # Identical payload: skip the duplicate.
                                    continue
                                else:
                                    # Record exists but the payload differs, so insert again.
                                    sqlInsert = "insert into spider_result(id,start_url,url,newurl,data,time) values ('','%s','%s','%s','%s','%s')" % (start_urls[0], node.url, newurl, data, timestamp())
                                    print db.insert(sql=sqlInsert), 'from' + '\t' + str(node.url) + '\t' + 'insert' + '\t' + str(newurl) + '\t' + str(timestamp())
                                    continue
                        else:
                            # Not seen before: insert a brand-new record.
                            sqlInsert = "insert into spider_result(id,start_url,url,newurl,data,time) values ('','%s','%s','%s','%s','%s')" % (start_urls[0], node.url, newurl, data, timestamp())
                            print db.insert(sql=sqlInsert), 'from' + '\t' + str(node.url) + '\t' + 'insert' + '\t' + str(newurl) + '\t' + str(timestamp())
        else:
            stop_flag += 1
            time.sleep(5)
    EXIT_FLAG += 1
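# The queries above splice values into SQL with % formatting, so a quote in
# `newurl` or `data` breaks the statement (and invites injection). A minimal
# sketch of the same dedup-then-insert with bound parameters, using the plain
# MySQLdb driver directly (whether the MySQL wrapper supports placeholders is
# unknown); save_probe is a hypothetical helper, and for brevity it compares
# the whole encoded record rather than only the decoded payload field.
import MySQLdb

def save_probe(db_conn, start_url, url, newurl, data, ts):
    # `data` is the base64-encoded JSON record, as built above.
    cur = db_conn.cursor()
    cur.execute("select data from spider_result where newurl = %s and start_url = %s",
                (newurl, start_url))
    if any(row[0] == data for row in cur.fetchall()):
        return  # identical record already stored; skip
    cur.execute("insert into spider_result(start_url,url,newurl,data,time) "
                "values (%s,%s,%s,%s,%s)",
                (start_url, url, newurl, data, ts))
    db_conn.commit()

# Usage (credentials mirror the dbconfig above):
# conn = MySQLdb.connect(host='127.0.0.1', port=3307, user='******',
#                        passwd='mysqlroot', db='w3a_scan', charset='utf8')
# save_probe(conn, start_urls[0], node.url, newurl, data, timestamp())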