import time  # needed for the polling sleep below


def collect(self):
    queue = FIFOQueue(
        self.task_info["redis_host"],
        self.task_info["redis_port"],
        self.task_info["parser_queue_key"]
    )
    pipeline = MongodbPipeline(
        self.task_info["db_host"],
        self.task_info["db_port"],
        self.task_info["db_name"]
    )
    parser = JiebaParser()

    # TODO shutdown signal
    while True:
        if len(queue) > 0:
            page_id = queue.pop()

            # fetch the crawled page and segment its content into terms
            item = pipeline.find(self.task_info["spider_stored_table"], page_id)
            terms = parser.segment(item["content"])
            terms_count = len(terms)

            # write the segmented terms back onto the stored item
            item["terms"] = terms
            pipeline.update(self.task_info["spider_stored_table"], page_id, item)

            # report progress to the master over RPC
            self.rpc_proxy.server.message(
                self.name,
                "Parse page[%s] and get %d terms" % (page_id, terms_count))
            print("Parse page[%s] and get %d terms" % (page_id, terms_count))
        else:
            print("Wait for tasks...")
            time.sleep(3)
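The TODO above still needs a clean shutdown path. One way to handle it is to trap SIGTERM/SIGINT and flip a flag that replaces the `while True:` loop condition, so the worker finishes the page it is processing before exiting. This is a minimal sketch, assuming collect() runs in the main thread (the standard signal module only delivers handlers there); the `_running` flag and the `install_shutdown_handler` name are hypothetical, not part of the original code.

import signal


def install_shutdown_handler(self):
    # hypothetical helper: call once at worker startup
    self._running = True

    def _handle(signum, frame):
        # ask the collect() loop to stop after the current iteration
        self._running = False

    signal.signal(signal.SIGTERM, _handle)
    signal.signal(signal.SIGINT, _handle)

With this in place, `while True:` in collect() becomes `while self._running:`, and the three-second sleep in the idle branch bounds how long a shutdown can take.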