import logging
import os

# open_db, close_db, Crawl and graphviz are provided elsewhere in the module.

def cmd_crawl(args, options):
    if len(args) != 1:
        logging.error("Missing build URL")
        return 1
    if options.to_file and not os.path.exists(options.to_file):
        os.mkdir(options.to_file)
    if options.from_file and not os.path.exists(options.from_file):
        os.mkdir(options.from_file)
    db = open_db(options)
    crawl = Crawl(db, options)
    if options.reverse:
        roots = crawl.reverse_crawl(args[0])
    else:
        roots = crawl.crawl(args[0])
    close_db(db)
    stat = roots[0].extra
    logging.info("Started: %s\n\tend: %s\n\telapsed: %s\n\tduration: %ss\n"
                 "\tNb builds: %s\n\tthroughput: %s\n" % (
                     stat['start'], stat['stop'], stat['elapsed'],
                     stat['duration'], stat['count'], stat['throughput']))
    if not options.output:
        svg_file = roots[0].getId() + ".svg"
    else:
        svg_file = options.output
    graphviz(roots, svg_file)
    logging.info("%s generated." % svg_file)
    return 0
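# A usage sketch (assumed, not from the source): cmd_crawl expects a single
# positional build URL plus an optparse/argparse-style options object exposing
# to_file, from_file, reverse and output. The values and URL below are
# hypothetical.
from argparse import Namespace

options = Namespace(to_file=None, from_file=None, reverse=False, output=None)
rc = cmd_crawl(["http://ci.example.com/job/1"], options)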
import time

# Crawl, Setting, PhoneList and db (Flask-SQLAlchemy) are module-level imports.

def thread_func(filename, cur):
    c = Crawl()
    # Read the uploaded file, skipping the first `cur` lines already processed
    with open('uploads/' + filename, 'r') as f:
        i = 1
        while True:
            print(cur, i)
            line = f.readline().strip('\n')
            if i <= cur:
                i = i + 1
                continue
            # Stop as soon as the is_crawl switch is turned off
            rs = Setting.query.filter_by(name='is_crawl').first()
            if rs.value == '0':
                break
            if not line:
                break
            time.sleep(1)
            flag = c.crawl(line)
            if flag:
                # status "2": crawled successfully
                db.session.add(PhoneList(filename=filename, phonenumber=str(line),
                                         status="2", opt_time=int(time.time())))
            else:
                # status "1": crawl failed
                db.session.add(PhoneList(filename=filename, phonenumber=str(line),
                                         status="1", opt_time=int(time.time())))
            db.session.commit()
            # do something
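# A usage sketch (assumed): thread_func is meant to run in a background
# thread, resuming after the first `cur` lines of the uploaded file.
# The filename below is hypothetical.
import threading

t = threading.Thread(target=thread_func, args=('numbers.txt', 0))
t.daemon = True
t.start()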
import re
import threading
import time
import tkinter as tk

# Crawl and Excel come from the module; var, start, end and e are
# module-level Tk widgets (a StringVar and three Entry fields).

class Work:
    def __init__(self):
        self.c = Crawl()
        self.e = Excel()

    def thread_it(self, func):
        # Run func on a daemon thread so it exits with the main window
        t = threading.Thread(target=func)
        t.daemon = True
        t.start()

    def setUp(self):
        #pb.start()
        self.c.setUp()
        #pb.stop()

    def crawl(self):
        var.set('')
        start_row = int(start.get())
        end_row = int(end.get())
        title_list = self.e.get_title_list(start_row, end_row)
        print(title_list, flush=True)
        self.c.crawl(title_list)
        time.sleep(2)
        # Advance the row range for the next batch
        start.delete(0, tk.END)
        end.delete(0, tk.END)
        time.sleep(1)
        start.insert(0, end_row + 1)
        end.insert(0, end_row + 4)
        num = end_row - start_row + 1
        var.set('Please enter ' + str(num) + ' results ')
        #num_list = c.insert()
        #self.e.write_num(num_list)

    def insert(self):
        num = e.get()
        # Split on ASCII or full-width commas
        num_list = [int(i) for i in re.split('[,，]', num)]
        print(num_list, flush=True)
        self.e.write_num(num_list)
        e.delete(0, tk.END)
        var.set('Data imported ')

    def tearDown(self):
        self.c.tearDown()
def crawl(args):
    # Crawl comes from the module; args is an options object
    # (e.g. an argparse namespace)
    if args.crawl:
        crawler = Crawl(args)
        crawler.crawl()
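# A minimal wiring sketch (assumed, not from the source): hooking the
# crawl(args) entry point to argparse. The --crawl flag name matches the
# attribute checked above; everything else is hypothetical.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run the crawler')
    parser.add_argument('--crawl', action='store_true',
                        help='start a crawl run')
    crawl(parser.parse_args())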
from flask import request, render_template

# logger and Crawl are module-level imports.

def get_result():
    logger.info(request.args['keywords'])
    crawl = Crawl()
    result = crawl.crawl(request.args['keywords'])
    logger.info(result)
    return render_template('result.html', result=result)
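# A wiring sketch (assumed): registering get_result as a Flask view. The
# route path and app object are hypothetical; the handler reads the
# `keywords` query parameter, e.g. GET /result?keywords=hotel.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/result', 'get_result', get_result)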
import threading
import time
from queue import Queue

# Crawl, proxies (an iterator of proxy addresses), Datafile and thread_count
# are module-level dependencies.

class MasterThread:
    def __init__(self):
        self.count = {
            'count': 0,          # total tasks crawled
            'failed_count': 0,   # total failures
            'success_count': 0,  # total successes
            'start_time': time.asctime(),  # start time
            'end_time': 0,       # end time
        }
        self.endtime = (time.localtime().tm_min + 5) % 60
        self.proxy = next(proxies)
        self.Crawl = Crawl()
        self.Crawl.proxy = self.proxy
        self.Taskqueue = Queue()
        self.Urlqueue = Queue()

    def send(self):
        """Crawl the city-level hotel URLs and add them to the queue."""
        cities = self.Crawl.get_cities_url()
        for i in cities:
            self.Taskqueue.put({i: cities[i]})
        self.count['count'] = self.Taskqueue.qsize()

    def recv(self):
        """Fetch every hotel URL and add it to the queue."""
        if not self.Taskqueue.empty():
            n = self.Taskqueue.get()
            url = next(iter(n.values()))
            self.log()
            hotel_list = self.Crawl.get_hotel_list(url)
            if hotel_list == []:
                self.count['failed_count'] += 1
                self.Taskqueue.put(n)  # requeue the city for retry
            for t in hotel_list:
                self.Urlqueue.put(t)
            print(self.Urlqueue.qsize())

    def start(self):
        """Run the crawl."""
        if Datafile.is_exit():
            # Resume from a previous checkpoint
            link = Datafile.open_csv()
            for t in link:
                self.Urlqueue.put(t[0])
        else:
            boot_threading = threading.Thread(target=self.send)
            boot_threading.start()
            boot_threading.join()
            for i in range(self.count['count']):
                t = threading.Thread(target=self.recv)
                t.start()
                t.join()
            if self.count['failed_count'] != 0:
                for i in range(self.count['failed_count']):
                    t = threading.Thread(target=self.recv)
                    t.start()
                    t.join()
            self.count['count'] = self.Urlqueue.qsize()
            self.count['failed_count'] = 0
        thread_list = []
        for s in range(thread_count):
            workerthread = threading.Thread(target=self.run)
            thread_list.append(workerthread)
        # Start all workers before joining; starting and joining in the same
        # loop would run them one at a time.
        for t in thread_list:
            t.start()
        for t in thread_list:
            t.join()
        Datafile.save('data')
        while not self.Urlqueue.empty():
            sa = [self.Urlqueue.get()]
            Datafile.dumps(sa)
        Datafile.save('rest')
        print('*******************')
        self.log()

    def run(self):
        """Worker thread."""
        while Datafile.d.qsize() < 10000:
            if not self.Urlqueue.empty():
                url = self.Urlqueue.get()
                data = self.Crawl.crawl(url)
                print(data)
                print(self.Crawl.proxy)
                if data:
                    self.count['success_count'] += 1
                    Datafile.dumps(data)
                else:
                    self.count['failed_count'] += 1
                    self.Urlqueue.put(url)  # requeue the failed URL
                    #Datafile.dumps(data)
                self.log()

    def log(self):
        """Print progress and switch the proxy IP every 5 minutes."""
        starttime = time.localtime().tm_min
        print('Total crawled:', self.count['success_count'])
        print('Total failed:', self.count['failed_count'])
        print('Hotels queued:', self.Urlqueue.qsize())
        print('Start time:', self.count['start_time'])
        print('————————————————————')
        if starttime == self.endtime:
            self.Crawl.proxy = next(proxies)
            self.endtime = (starttime + 5) % 60
        if self.Urlqueue.empty():
            self.count['end_time'] = time.asctime()
            print('End time:', self.count['end_time'])
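# A usage sketch (assumed): the module-level proxies iterator, Datafile
# helper and thread_count must be set up before starting the master.
if __name__ == '__main__':
    master = MasterThread()
    master.start()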