def crawl2csv(filename, start, end):
    """Crawl uploader video lists into a CSV file.

    NOTE: sleepsec could instead be drawn from a normal distribution over a
    range. start, end: range of uploader mids.
    """
    task_queue = Queue()
    with open(filename, 'w', encoding='utf8', newline='') as fwriter:
        mycsvwriter = csv.writer(fwriter)
        # One producer feeding the queue, plus a small pool of consumers.
        workers = [Producer(task_queue, start=start, end=end,
                            func=BiliUser.getVideoList, sleepsec=0.1)]
        consumer_num = 4  # four consumer threads
        workers.extend(
            Consumer(task_queue, csvwriter=mycsvwriter,
                     func=BiliVideo.store_video_simpleajax, sleepsec=0.01)
            for _ in range(consumer_num))
        with Timer() as t:
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()
        print('runtime - (%i_%i) - : %s' % (start, end, t.elapsed))
    print('======= All Done! =======')
def crawl2db(getsession, start, end):
    """Crawl uploader video lists into the database.

    Sharing one connection across threads is problematic, so a pool of
    sessions is built — one session per consumer thread.
    Video access rate is strictly limited; please increase sleepsec.
    """
    task_queue = Queue()
    workers = [Producer(task_queue, start=start, end=end,
                        func=BiliUser.getVideoList, sleepsec=0.1)]
    consumer_num = 4  # four consumer threads
    sessions = [getsession() for _ in range(consumer_num)]
    for db_session in sessions:  # one dedicated session per consumer
        workers.append(Consumer(task_queue, session=db_session,
                                func=BiliVideo.store_video_simpleajax,
                                sleepsec=0.01))
    with Timer() as t:
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    for session in sessions:
        session.close()
    print('runtime - (%i_%i) - : %s' % (start, end, t.elapsed))
    print('======= All Done! =======')
def produce_consume():
    """Run one produce/consume cycle: crawl domains, then optionally persist.

    Terminates the interpreter with exit status 1 when the configuration is
    missing or invalid.
    """
    real_path, word_path, config_path = paths()
    check_paths(word_path, config_path)
    config = get_config(config_path)
    try:
        error = check_config(config)
    except Exception as e:
        print(type(e).__name__, e)
        # raise SystemExit instead of exit(): exit() is injected by the
        # `site` module and is absent under `python -S` or in frozen apps.
        raise SystemExit(1)
    else:
        if error is not None:
            print(error)
            raise SystemExit(1)
    q = Queue()
    consumer = Consumer(q)
    for _ in range(16):  # 16 worker threads draining the queue
        # daemon=True so stray workers never block interpreter shutdown
        Thread(target=consumer.consume_domains, daemon=True).start()
    Producer(q, config, word_path).get_doms()
    q.join()  # block until every queued domain has been processed
    if config['write_to_file']:
        print_red('writing to domains.json')
        # Persist in a separate process so writing does not delay the sleep.
        p = Process(target=add_data, args=(real_path, consumer.get_domains()))
        p.start()
    print_red('sleeping zzzzz...')
    sleep(config['interval'])
class Administrator:
    """
    Receives and logs every message sent within the whole system.
    Is allowed to broadcast messages to all other workers.
    """

    def __init__(self):
        # Subscribe to every routing key ('#') on the hospital topic exchange
        # and handle incoming logs on a background thread.
        self._log_consumer = Consumer('hospital', 'topic', 'localhost')
        self._log_queue = self._log_consumer.add_queue(routing_key='#',
                                                       callback=self.process_log)
        self._log_consumer.start(new_thread=True)
        # Fanout exchange used to broadcast info to all workers.
        self._info_producer = Producer('info', 'fanout', 'localhost')

    def send_info(self, message):
        """Broadcast *message* to every worker via the info exchange."""
        print('sending info: ', message)
        self._info_producer.send_message(message=message)

    def process_log(self, ch, method, properties, body):
        """Decode, colorize and print an incoming log message, then ack it."""
        text = body.decode()
        print(colored('LOG: ' + text, 'yellow'))
        ch.basic_ack(delivery_tag=method.delivery_tag)
def crawl2csv(filename, start, end):
    """Crawl video info for an aid range into a CSV file.

    NOTE: sleepsec could instead be drawn from a normal distribution over a
    range.

    Args:
        filename: path of the CSV file to write.
        start, end: range of video aids to crawl.
    """
    Q = Queue()
    with open(filename, 'w', encoding='utf8', newline='') as fwriter:
        mycsvwriter = csv.writer(fwriter)
        mythreads = []
        # Producer enqueues each aid as a 1-tuple task.
        pthread = Producer(Q, start=start, end=end,
                           func=lambda x: (x, ), sleepsec=0.5)
        mythreads.append(pthread)
        consumer_num = 4  # four consumer threads
        for _ in range(consumer_num):
            cthread = Consumer(Q, csvwriter=mycsvwriter,
                               func=BiliVideo.store_video, sleepsec=0.5)
            mythreads.append(cthread)
        with Timer() as t:
            for thread in mythreads:
                thread.start()
            for thread in mythreads:
                thread.join()
        print('runtime: %s' % t.elapsed)
    # Fixed asymmetric banner (had 6 trailing '=' vs 7 everywhere else).
    print('======= All Done! =======')
def crawl2db(getsession, start, end):
    """Crawl video info for an aid range into the database.

    A single shared connection is unsafe across threads, so a pool of
    sessions is created, one per consumer thread.
    Video access rate is strictly limited; please increase sleepsec.

    Args:
        getsession: zero-argument factory returning a new DB session.
        start, end: range of video aids to crawl.
    """
    Q = Queue()
    mythreads = []
    # Producer enqueues each aid as a 1-tuple task.
    pthread = Producer(Q, start=start, end=end,
                       func=lambda x: (x,), sleepsec=0.5)
    mythreads.append(pthread)
    consumer_num = 4  # four consumer threads
    sessions = [getsession() for _ in range(consumer_num)]
    for i in range(consumer_num):
        db_session = sessions[i]  # one session per thread
        cthread = Consumer(Q, session=db_session,
                           func=TddAddFocusVideo.store_video, sleepsec=0.5)
        mythreads.append(cthread)
    with Timer() as t:
        for thread in mythreads:
            thread.start()
        for thread in mythreads:
            thread.join()
    for session in sessions:
        session.close()
    print('runtime: %s' % t.elapsed)
    # Fixed asymmetric banner (had 6 trailing '=' vs 7 everywhere else).
    print('======= All Done! =======')
def __init__(self):
    """Wire up the log consumer ('#' on the hospital topic exchange, handled
    on a background thread) and the fanout producer used for broadcasts."""
    log_consumer = Consumer('hospital', 'topic', 'localhost')
    self._log_consumer = log_consumer
    self._log_queue = log_consumer.add_queue(routing_key='#',
                                             callback=self.process_log)
    log_consumer.start(new_thread=True)
    self._info_producer = Producer('info', 'fanout', 'localhost')