Esempio n. 1
0
 def fetch(self):
     """Fetch this package's sources and record the result in the inventory.

     Returns True immediately (with a warning) when the recipe has no
     sources configured. Raises PBRecipeException when fetching fails.
     """
     # Nothing to fetch without sources; warn and report success so the
     # build can proceed for source-less packages.
     if not self.source:
         v.print_v(
             v.WARN,
             "WARNING: no sources available for package %s!" % (self.name))
         return True
     fetcher = fetch.fetcher(self.source, self)
     fetcher.fetch()
     if not fetcher.success:
         # Sources are guaranteed non-empty here (the empty case returned
         # above), so the former "no sources were provided" branch was
         # unreachable and has been removed.
         raise PBRecipeException(
             "Failed to Fetch package '%s' sources were '%s'!" %
             (self.name, self.source))
     # update value in inventory
     inv.set_state(self.name, "fetch")
     self.last_fetcher = fetcher
     v.print_v(
         v.DEBUG, "Setting fetched version info (%s,%s)" %
         (fetcher.used_source, fetcher.version))
     inv.set_prop(self.name, "source", fetcher.used_source)
     inv.set_prop(self.name, "version", fetcher.version)
Esempio n. 2
0
def single_thread(mytuple, QUEUE_HTMLNODE, DOWNLOAD_MODE):
    #打印信息:
    #时间  结点深度  html的长度  URL队列长度  下载的数量  过滤掉的数量  相似结点队列长度  非重结点队列长度  当前爬取的URL
    global TOTAL_COUNT
    global EXIT_FLAG
    global QUEUE_URLNODE
    stop_flag = 0
    while stop_flag < 15:
        if mytuple[0].qsize() > 0:
            stop_flag = 0
            node = mytuple[0].get()
            html = fetcher(node.url, DOWNLOAD_MODE)
            html_node = HtmlNode(node.url, html, timestamp(), node.depth)
            QUEUE_HTMLNODE.put(html_node)
            TOTAL_COUNT += 1
            if len(html) > 0:
                print timestamp() + '\t' + str(node.depth) + '\t' + str(
                    len(html)) + '\t' + str(QUEUE_URLNODE.qsize(
                    )) + '\t' + str(TOTAL_COUNT) + '\t' + str(
                        REFUSE_COUNT) + '\t' + str(
                            QUEUE_SMART_NODE.qsize()) + '\t' + str(
                                QUEUE_COMPLETE_NODE.qsize()) + '\t' + node.url

        else:
            stop_flag += 1
            time.sleep(5)
    print "EXIT_FLAG:%s" % EXIT_FLAG
    EXIT_FLAG += 1
Esempio n. 3
0
 def fetch(self):
     """Fetch this package's sources and record the result in the inventory.

     Returns True immediately (with a warning) when the recipe has no
     sources configured. Raises PBRecipeException when fetching fails.
     """
     # Nothing to fetch without sources; warn and report success.
     # (Spurious C-style semicolons and len()==0 checks removed.)
     if not self.source:
         v.print_v(v.WARN,
                   "WARNING: no sources available for package %s!" % (self.name))
         return True
     fetcher = fetch.fetcher(self.source, self)
     fetcher.fetch()
     if not fetcher.success:
         # Sources are guaranteed non-empty here (the empty case returned
         # above), so the former "no sources were provided" branch was
         # unreachable and has been removed.
         raise PBRecipeException(
             "Failed to Fetch package '%s' sources were '%s'!"
             % (self.name, self.source))
     # update value in inventory
     inv.set_state(self.name, "fetch")
     self.last_fetcher = fetcher
     v.print_v(v.DEBUG,
               "Setting fetched version info (%s,%s)"
               % (fetcher.used_source, fetcher.version))
     inv.set_prop(self.name, "source", fetcher.used_source)
     inv.set_prop(self.name, "version", fetcher.version)
Esempio n. 4
0
def single_thread(mytuple,QUEUE_HTMLNODE,DOWNLOAD_MODE):
    #打印信息: 
    #时间  结点深度  html的长度  URL队列长度  下载的数量  过滤掉的数量  相似结点队列长度  非重结点队列长度  当前爬取的URL
    global TOTAL_COUNT
    global EXIT_FLAG
    global QUEUE_URLNODE
    stop_flag = 0
    while stop_flag < 30:
        if mytuple[0].qsize() > 0:
            stop_flag = 0
            node = mytuple[0].get()
            html = fetcher(node.url,DOWNLOAD_MODE)
            html_node = HtmlNode(node.url,html,timestamp(),node.depth)
            QUEUE_HTMLNODE.put(html_node)
            TOTAL_COUNT += 1
            print timestamp()+'\t'+str(node.depth)+'\t'+str(len(html))+'\t'+str(QUEUE_URLNODE.qsize())+'\t'+str(TOTAL_COUNT)+'\t'+str(REFUSE_COUNT) + '\t' +str(QUEUE_SMART_NODE.qsize()) + '\t' + str(QUEUE_COMPLETE_NODE.qsize()) + '\t' + node.url

        else:
            stop_flag += 1
            time.sleep(10)
    EXIT_FLAG += 1
Esempio n. 5
0
 def fetched(self):
     """Report whether this package's sources have already been fetched."""
     # Delegate straight to a fetcher built from this recipe's sources.
     return fetch.fetcher(self.source, self).fetched()
Esempio n. 6
0
 def fetched(self):
     """Report whether this package's sources have already been fetched."""
     # Delegate to a fetcher built from this recipe's sources.
     # (Non-idiomatic trailing semicolons removed.)
     fetcher = fetch.fetcher(self.source, self)
     return fetcher.fetched()
Esempio n. 7
0
def single_thread(mytuple,QUEUE_HTMLNODE,DOWNLOAD_MODE,start_urls):
    """Crawler worker: drain URL nodes from mytuple[0], download each page,
    extract forms via getform(), and store de-duplicated probe records in
    the spider_result MySQL table.

    Exits after 15 consecutive empty-queue polls (5s sleep each, ~75s idle)
    and then increments the shared EXIT_FLAG counter.

    NOTE(review): assumes start_urls is a non-empty sequence (start_urls[0]
    is used as the dedup key) -- confirm against the caller.
    """
    # Logged columns (legacy format, mostly commented out below):
    # time / node depth / html length / URL-queue size / download count /
    # refused count / similar-node queue size / unique-node queue size /
    # current URL
    global TOTAL_COUNT
    global EXIT_FLAG
    global QUEUE_URLNODE
    stop_flag = 0  # consecutive empty-queue polls
    while stop_flag < 15:
        if mytuple[0].qsize() > 0:
            stop_flag = 0
            node = mytuple[0].get()
            html = fetcher(node.url,DOWNLOAD_MODE)
            html_node = HtmlNode(node.url,html,timestamp(),node.depth)
            QUEUE_HTMLNODE.put(html_node)
            TOTAL_COUNT += 1
            if len(html) > 0:
                #print timestamp()+'\t'+str(node.depth)+'\t'+str(len(html))+'\t'+str(QUEUE_URLNODE.qsize())+'\t'+str(TOTAL_COUNT)+'\t'+str(REFUSE_COUNT) + '\t' +str(QUEUE_SMART_NODE.qsize()) + '\t' + str(QUEUE_COMPLETE_NODE.qsize()) + '\t' + node.url
                print str(node.depth)+'\t'+str(QUEUE_SMART_NODE.qsize()) + '\t' + node.url
                result = getform(node.url)

                # NOTE(review): a fresh DB connection is opened per page;
                # consider hoisting it out of the loop if this proves costly.
                dbconfig = {'host': '127.0.0.1','user': '******','passwd': 'mysqlroot','port': 3307,'db':'w3a_scan','charset': 'utf8'}
                db = MySQL(dbconfig)
                '''
                SET NAMES utf8;
                SET FOREIGN_KEY_CHECKS = 0;

                -- ----------------------------
                --  Table structure for `spider_result`
                -- ----------------------------
                DROP TABLE IF EXISTS `spider_result`;
                CREATE TABLE `spider_result` (
                  `id` int(11) NOT NULL AUTO_INCREMENT,
                  `start_url` varchar(200) NOT NULL,
                  `url` varchar(500) NOT NULL,
                  `newurl` varchar(500) NOT NULL,
                  `data` varchar(4000) NOT NULL,
                  `time` varchar(200) NOT NULL,
                  PRIMARY KEY (`id`)
                ) ENGINE=MyISAM DEFAULT CHARSET=utf8;
                SET FOREIGN_KEY_CHECKS = 1;
                '''
                if result is not None:
                    for lists in result:
                        data = base64.b64encode(json.dumps(lists["probe"]))
                        newurl = lists["probe"]['url']
                        #print lists["probe"]["payload"]
                        #print type(lists["probe"]["payload"])
                        # NOTE(review): SQL is built by string interpolation
                        # from crawled URLs -- injection risk; switch to
                        # parameterized queries if the MySQL wrapper supports
                        # them.
                        sqlSearch = "select data from spider_result where newurl= '%s' and start_url = '%s'" % (newurl,start_urls[0])
                        #print "query statement: "+sqlSearch
                        #print "query result: "+str(db.query(sql=sqlSearch))
                        if int(db.query(sql=sqlSearch)) > 0:
                            #print "duplicate condition met"
                            # NOTE(review): 'result' is rebound here (and again
                            # below) while the outer 'for lists in result' loop
                            # is running; the loop keeps its original iterator,
                            # but the shadowing is fragile -- rename if touched.
                            result = db.fetchAllRows()
                            for row in result:
                                result = json.loads(base64.b64decode(row[0]))
                                cmps = cmp(lists["probe"]["payload"].encode("utf-8"),result["payload"])
                                if int(cmps) == 0:
                                    #print "duplicate"
                                    continue
                                else:
                                    #print "record exists but payload differs; insert again"
                                    sqlInsert = "insert into spider_result(id,start_url,url,newurl,data,time) values ('','%s','%s','%s','%s','%s')" % (start_urls[0],node.url,newurl,data,timestamp())
                                    print db.insert(sql=sqlInsert), 'from'+'\t'+ str(node.url)+'\t'+'insert'+'\t'+str(newurl)+'\t'+str(timestamp())
                                    continue
                        # not found before: brand-new record, insert directly
                        else:
                            sqlInsert = "insert into spider_result(id,start_url,url,newurl,data,time) values ('','%s','%s','%s','%s','%s')" % (start_urls[0],node.url,newurl,data,timestamp())
                            #print sqlInsert
                            print db.insert(sql=sqlInsert),'from'+'\t'+ str(node.url)+'\t'+'insert'+'\t'+str(newurl)+'\t'+str(timestamp())
                #    pass
                #else:
                #    print data

        else:
            stop_flag += 1
            time.sleep(5)
    EXIT_FLAG += 1