import json
import logging
import time


def read_images(self):
    """ read in image samples """
    worker = Worker(50)
    num_items = 0
    for line in self.fio_r:
        line = line.strip()
        try:
            item = json.loads(line)
        except ValueError:
            # skip lines that are not valid JSON
            continue
        worker.add_task(self.tag_image, item)
        num_items += 1
    logging.debug('%d images have been read' % num_items)
    worker.join()
def run_page_crawler(self):
    ''' listen to crawler priority queue and crawl pages '''
    worker = Worker(self.args.thread)
    while True:
        # get one item from the queue
        page = self.queue.de_queue()
        if page:
            self.stats.crawl_in_progress += 1
            page.time_dequeue = time.time()
            # initialize a generic crawler instance
            worker.add_task(self.call_crawl_page, page)
        if self.stats.crawl_in_progress == self.max_num_pages:
            break
    worker.join()
    self.shutdown = True
    self.log_queue.put(self.end_page_log_item)
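
# The Worker thread pool used above is not defined in this file. The
# following is a minimal sketch consistent with the calls made here
# (Worker(n), add_task(func, *args), join()); the actual class in this
# codebase may differ in details such as error handling or shutdown.
import threading
from queue import Queue


class Worker(object):
    """Minimal thread pool: start n daemon threads that pull (func, args)
    tasks off an internal queue; join() blocks until every queued task
    has been processed."""

    def __init__(self, num_threads):
        self.tasks = Queue()
        for _ in range(num_threads):
            t = threading.Thread(target=self._run)
            t.daemon = True
            t.start()

    def _run(self):
        # each thread loops forever, executing tasks as they arrive
        while True:
            func, args = self.tasks.get()
            try:
                func(*args)
            finally:
                self.tasks.task_done()

    def add_task(self, func, *args):
        # enqueue a callable and its arguments for a worker thread
        self.tasks.put((func, args))

    def join(self):
        # block until all enqueued tasks have been marked done
        self.tasks.join()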