Example #1
def cmd_crawl(args, options):
    if len(args) != 1:
        logging.error("Missing build URL")
        return 1
    if options.to_file and not os.path.exists(options.to_file):
        os.mkdir(options.to_file)
    if options.from_file and not os.path.exists(options.from_file):
        os.mkdir(options.from_file)
    db = open_db(options)
    crawl = Crawl(db, options)
    if options.reverse:
        roots = crawl.reverse_crawl(args[0])
    else:
        roots = crawl.crawl(args[0])
    close_db(db)
    stat = roots[0].extra
    logging.info("Started: %s\n\tend: %s\n\telapsed: %s\n\tduration: %ss\n\tNb builds: %s\n\ttrhoughput: %s\n" % (
            stat['start'], stat['stop'], stat['elapsed'], stat['duration'], stat['count'], stat['throughput']))
    if not options.output:
        svg_file = roots[0].getId() + ".svg"
    else:
        svg_file = options.output
    graphviz(roots, svg_file)
    logging.info("%s generated." % svg_file)
    return 0
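A minimal sketch of how this subcommand might be wired up with optparse; the option names and the open_db/close_db/Crawl helpers are assumptions inferred from the body above, not confirmed by the source.

import logging
import optparse
import sys

def main(argv=None):
    # Hypothetical CLI wiring: parse the options cmd_crawl reads, then dispatch.
    parser = optparse.OptionParser(usage="usage: %prog BUILD_URL")
    parser.add_option("--to-file", dest="to_file", help="directory to dump crawled data into")
    parser.add_option("--from-file", dest="from_file", help="directory to replay crawled data from")
    parser.add_option("--reverse", action="store_true", default=False, help="crawl upstream builds")
    parser.add_option("--output", help="output SVG file name")
    options, args = parser.parse_args(argv)
    logging.basicConfig(level=logging.INFO)
    return cmd_crawl(args, options)

if __name__ == "__main__":
    sys.exit(main())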
Example #2
def thread_func(filename, cur):
    c = Crawl()

    # read the uploaded file line by line
    f = open('uploads/' + filename, 'r')
    i = 1
    while True:
        print(cur, i)

        line = f.readline().strip('\n')
        if i<=cur:
            i = i+1
            continue
        rs = Setting.query.filter_by(name='is_crawl').first()
        
        if rs.value == '0':
            break
        if not line:
            break
        time.sleep(1)
        flag = c.crawl(line)
        
        if flag:
            db.session.add(PhoneList(filename=filename, phonenumber=str(line),
                                     status="2", opt_time=int(time.time())))
        else:
            db.session.add(PhoneList(filename=filename, phonenumber=str(line),
                                     status="1", opt_time=int(time.time())))
        db.session.commit()
        pass # do something
    f.close()
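A hedged sketch of how this worker might be launched off the request thread, for example right after the uploaded file is saved; the helper name and the resume offset cur are assumptions, with cur normally coming from previously stored progress.

import threading

def start_crawl(filename, cur=0):
    # Run thread_func in the background so the caller returns immediately.
    worker = threading.Thread(target=thread_func, args=(filename, cur), daemon=True)
    worker.start()
    return worker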
Example #3
class Work:
    def __init__(self):
        self.c = Crawl()
        self.e = Excel()

    def thread_it(self, func):
        # create the thread
        t = threading.Thread(target=func)
        # make it a daemon so it exits with the main thread
        t.daemon = True
        # start it
        t.start()

    def setUp(self):
        #pb.start()
        self.c.setUp()
        #pb.stop()

    def crawl(self):
        var.set('')
        start_row = int(start.get())
        end_row = int(end.get())
        title_list = self.e.get_title_list(start_row, end_row)
        print(title_list, flush=True)
        self.c.crawl(title_list)
        time.sleep(2)
        start.delete(0, tk.END)
        end.delete(0, tk.END)
        time.sleep(1)
        start.insert(0, end_row + 1)
        end.insert(0, end_row + 4)
        num = end_row - start_row + 1
        var.set('Please enter ' + str(num) + ' results ')
        #num_list=c.insert() 
        #self.e.write_num(num_list)

    def insert(self):
        num=e.get()
        num_list=[int(i) for i in re.split('[,,]',num)]
        print(num_list,flush=True)
        self.e.write_num(num_list)
        e.delete(0,tk.END)
        var.set('Data imported ')

    def tearDown(self):
        self.c.tearDown()
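A hedged sketch of the Tkinter globals this class leans on; start, end, e and var are assumed to be module-level widgets, and the window layout and button labels below are placeholders, not part of the original source.

import tkinter as tk

root = tk.Tk()
work = Work()

start = tk.Entry(root); start.pack()
end = tk.Entry(root); end.pack()
e = tk.Entry(root); e.pack()
var = tk.StringVar(master=root)
tk.Label(root, textvariable=var).pack()

# Long-running actions go through thread_it so the UI stays responsive.
tk.Button(root, text="crawl", command=lambda: work.thread_it(work.crawl)).pack()
tk.Button(root, text="insert", command=work.insert).pack()

work.setUp()
root.mainloop()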
Example #4
def crawl(args):
    if args.crawl:
        crawler = Crawl(args)
        crawler.crawl()
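A minimal sketch of the argparse plumbing that could feed this entry point; only args.crawl is taken from the body above, while the flag wording and description are assumptions.

import argparse

def main():
    parser = argparse.ArgumentParser(description="Run the crawler")
    parser.add_argument("--crawl", action="store_true", help="start crawling")
    args = parser.parse_args()
    crawl(args)

if __name__ == "__main__":
    main()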
Example #5
def get_result():
    logger.info(request.args['keywords'])
    crawl = Crawl()
    result = crawl.crawl(request.args['keywords'])
    logger.info(result)
    return render_template('result.html', result=result)
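A hedged sketch of the Flask wiring around this view; the route path and logger setup are assumptions, while the function itself reads request.args['keywords'] and renders result.html as shown above.

import logging
from flask import Flask

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = Flask(__name__)
# Register the existing view function under a hypothetical /result route.
app.add_url_rule('/result', view_func=get_result)

if __name__ == '__main__':
    app.run(debug=True)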
Example #6
class MasterThread:
    def __init__(self):
        self.count = {
            'count': 0,  # total crawled
            'failed_count': 0,  # total failed crawls
            'success_count': 0,  # total successful crawls
            'start_time': time.asctime(),  # start time
            'end_time': 0,  # end time
        }
        self.endtime = time.localtime().tm_min + 1
        self.proxy = next(proxies)
        self.Crawl = Crawl()
        self.Crawl.proxy = self.proxy

        self.Taskqueue = Queue()
        self.Urlqueue = Queue()

    def send(self):
        """
        爬取市级酒店的网址并加入队列
        """
        cities = self.Crawl.get_cities_url()
        for i in cities:
            self.Taskqueue.put({i: cities[i]})
        self.count['count'] = self.Taskqueue.qsize()

    def recv(self):
        """获取所有酒店网址并加入队列"""
        if not self.Taskqueue.empty():
            n = self.Taskqueue.get()
            url = [n[i] for i in n][0]
            self.log()
            hotel_list = self.Crawl.get_hotel_list(url)
            if hotel_list == []:
                self.count['failed_count'] += 1
                self.Taskqueue.put(n)
            for t in hotel_list:
                self.Urlqueue.put(t)
            print(self.Urlqueue.qsize())

    def start(self):
        """运行"""
        if Datafile.is_exit():  #断点续传
            link = Datafile.open_csv()
            for t in link:
                self.Urlqueue.put(t[0])
        else:
            boot_threading = threading.Thread(target=self.send)
            boot_threading.start()
            boot_threading.join()

            for i in range(self.count['count']):
                t = threading.Thread(target=self.recv, )
                t.start()
                t.join()
            if self.count['failed_count'] != 0:
                for i in range(self.count['failed_count']):
                    t = threading.Thread(target=self.recv, )
                    t.start()
                    t.join()
        self.count['count'] = self.Urlqueue.qsize()
        self.count['failed_count'] = 0

        thread_list = []
        for s in range(thread_count):
            workerthread = threading.Thread(target=self.run, )
            thread_list.append(workerthread)
        for t in thread_list:
            t.start()
            t.join()
        Datafile.save('data')
        while not self.Urlqueue.empty():
            sa = [self.Urlqueue.get()]
            Datafile.dumps(sa)
        Datafile.save('rest')
        print('*******************')
        self.log()

    def run(self):
        """工作线程"""
        while Datafile.d.qsize() < 10000:
            if not self.Urlqueue.empty():
                url = self.Urlqueue.get()
                data = self.Crawl.crawl(url)
                print(data)
                print(self.Crawl.proxy)
                if data:
                    self.count['success_count'] += 1
                    Datafile.dumps(data)
                else:
                    self.count['failed_count'] += 1
                    self.Urlqueue.put(url)
            #Datafile.dumps(data)
            self.log()

    def log(self):
        """
        每间隔5分钟切换IP
        """
        starttime = time.localtime().tm_min
        print('爬取总数: ', self.count['sucess_count'])
        print('失败总数:', self.count['failed_count'])
        print('酒店总数', self.Urlqueue.qsize())
        print('开始时间:', self.count['start_time'])
        print('————————————————————')
        if starttime == self.endtime:
            self.Crawl.proxy = next(proxies)
            self.endtime = starttime + 60
        if self.Urlqueue.empty():
            self.count['end_time'] = time.asctime()
            print('End time:', self.count['end_time'])
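A hedged sketch of how this master thread might be driven; proxies, thread_count and Datafile are module-level dependencies the class expects, and the proxy cycle and thread count below are placeholders, not values from the original project.

import itertools

# Placeholder globals assumed by MasterThread; real values come from the project's config.
proxies = itertools.cycle(['http://127.0.0.1:8080'])
thread_count = 4

if __name__ == '__main__':
    master = MasterThread()
    master.start()  # fill the queues, run the workers, then persist the results
    master.log()    # final progress summary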