def collect(self):
    # Connect to the shared Redis parser queue and the MongoDB pipeline.
    queue = FIFOQueue(self.task_info["redis_host"],
                      self.task_info["redis_port"],
                      self.task_info["parser_queue_key"])
    pipeline = MongodbPipeline(self.task_info["db_host"],
                               self.task_info["db_port"],
                               self.task_info["db_name"])
    parser = JiebaParser()
    # TODO shutdown signal
    while True:
        if len(queue) > 0:
            page_id = queue.pop()
            item = pipeline.find(self.task_info["spider_stored_table"], page_id)
            # Segment the stored page content into terms.
            terms = parser.segment(item["content"])
            terms_count = len(terms)
            # Write the extracted terms back to the stored item.
            item["terms"] = terms
            pipeline.update(self.task_info["spider_stored_table"], page_id, item)
            # Report progress to the master over RPC.
            self.rpc_proxy.server.message(
                self.name,
                "Parsed page [%s] and got %d terms" % (page_id, terms_count))
            print("Parsed page [%s] and got %d terms" % (page_id, terms_count))
        else:
            print("Waiting for tasks...")
            time.sleep(3)
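# Both workers pull their work from a FIFOQueue backed by Redis. That class is not
# shown in this section; the sketch below is a minimal version, assuming the redis-py
# client and one Redis list per key, that matches only the interface used above
# (push, pop, len). It is an illustration, not the project's actual implementation.
import redis


class FIFOQueue(object):
    """Minimal FIFO queue on a Redis list (sketch)."""

    def __init__(self, host, port, key):
        self.key = key
        self.server = redis.StrictRedis(host=host, port=port)

    def push(self, value):
        # Append to the tail of the list.
        self.server.rpush(self.key, value)

    def pop(self):
        # Remove and return the head of the list (None when empty).
        return self.server.lpop(self.key)

    def __len__(self):
        return self.server.llen(self.key)


# Because several fetchers and collectors can point at the same host/port/key,
# the Redis list doubles as the coordination point between worker processes.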
def fetch(self):
    # Queue of URLs to crawl and queue of page IDs handed off to the parser.
    spider_queue = FIFOQueue(host=self.task_info["redis_host"],
                             port=self.task_info["redis_port"],
                             key=self.task_info["spider_queue_key"])
    task_queue = FIFOQueue(host=self.task_info["redis_host"],
                           port=self.task_info["redis_port"],
                           key=self.task_info["parser_queue_key"])
    crawler = SimpleCrawler(self.task_info["start_url"],
                            self.task_info["allowed_domain"])
    dupefilter = SimpleDupefilter(host=self.task_info["redis_host"],
                                  port=self.task_info["redis_port"],
                                  key=self.task_info["spider_dupefilter_key"])
    pipeline = MongodbPipeline(self.task_info["db_host"],
                               self.task_info["db_port"],
                               self.task_info["db_name"])
    # Seed the crawl with the start URL.
    spider_queue.push(self.task_info["start_url"])
    # TODO shutdown signal
    while True:
        if len(spider_queue) > 0:
            current_url = spider_queue.pop()
            crawler.fetch(current_url)
            # Only parse and store the page if the crawler fetched it successfully.
            if crawler.success:
                item = crawler.parse()
                # Enqueue outgoing links that have not been seen before.
                next_urls = item.get("links")
                next_urls_count = 0
                for next_url in next_urls:
                    if not dupefilter.exists(next_url):
                        spider_queue.push(next_url)
                        next_urls_count += 1
                # Print fetch information and report back to the master.
                print("Crawler fetched %s and got %d urls" % (current_url, next_urls_count))
                self.rpc_proxy.server.message(
                    self.name, "Successfully fetched url %s." % current_url)
                # Store the page and hand its ID to the parser queue.
                item = pipeline.insert(item, self.task_info["spider_stored_table"])
                task_queue.push(item.get("_id"))
                self.rpc_proxy.server.message(
                    self.name,
                    "Stored url %s with ID %s." % (current_url, item.get("mongo_id")))
        else:
            print("Waiting for tasks...")
            time.sleep(3)
def fetch(self): spider_queue = FIFOQueue( host=self.task_info["redis_host"], port=self.task_info["redis_port"], key=self.task_info["spider_queue_key"] ) task_queue = FIFOQueue( host=self.task_info["redis_host"], port=self.task_info["redis_port"], key=self.task_info["parser_queue_key"] ) crawler = SimpleCrawler(self.task_info["start_url"], self.task_info["allowed_domain"]) dupefilter = SimpleDupefilter( host=self.task_info["redis_host"], port=self.task_info["redis_port"], key=self.task_info["spider_dupefilter_key"], ) pipeline = MongodbPipeline(self.task_info["db_host"], self.task_info["db_port"], self.task_info["db_name"]) spider_queue.push(self.task_info["start_url"]) # TODO shutdown signal while True: if len(spider_queue) > 0: current_url = spider_queue.pop() crawler.fetch(current_url) # if crawler successful fetch the content if crawler.success: item = crawler.parse() next_urls = item.get("links") next_urls_count = 0 for next_url in next_urls: if not dupefilter.exists(next_url): spider_queue.push(next_url) next_urls_count += 1 # print fetch infomation print "Crawler fetched %s and get %d urls" % (current_url, next_urls_count) self.rpc_proxy.server.message(self.name, "Success fetched url %s." % current_url) item = pipeline.insert(item, self.task_info["spider_stored_table"]) task_queue.push(item.get("_id")) self.rpc_proxy.server.message( self.name, "Stored url %s with ID %s." % (current_url, item.get("mongo_id")) ) else: print "Wait for tasks..." time.sleep(3)