def collect(self):
    queue = FIFOQueue(self.task_info['redis_host'], self.task_info['redis_port'], self.task_info['parser_queue_key'])
    pipeline = MongodbPipeline(self.task_info['db_host'], self.task_info['db_port'], self.task_info['db_name'])
    parser = JiebaParser()
    # TODO shutdown signal
    while True:
        if len(queue) > 0:
            page_id = queue.pop()
            item = pipeline.find(self.task_info['spider_stored_table'], page_id)
            terms = parser.segment(item['content'])
            terms_count = len(terms)
            # write the extracted terms back onto the stored page
            item['terms'] = terms
            pipeline.update(self.task_info['spider_stored_table'], page_id, item)
            # report progress to the master over RPC
            self.rpc_proxy.server.message(
                self.name, "Parsed page [%s] and got %d terms" % (page_id, terms_count))
            print("Parsed page [%s] and got %d terms" % (page_id, terms_count))
        else:
            print("Wait for tasks...")
            time.sleep(3)
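# JiebaParser is used by collect() above but is not defined in this excerpt.
# Below is a minimal sketch of what segment() is assumed to do, built on the
# jieba package; the class and method names come from the call site, while
# dropping whitespace-only tokens is an added assumption.
import jieba


class JiebaParser(object):
    """Segment Chinese text into a list of terms using jieba."""

    def segment(self, content):
        # jieba.lcut returns the tokens as a list; filter out empty tokens
        return [term for term in jieba.lcut(content) if term.strip()]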
class ProcessorWorker(Worker):

    def __init__(self, master='127.0.0.1:2181', type='spider'):
        """
        Processing worker.
        :param master: master node address
        :param type: worker type
        :return:
        """
        Worker.__init__(self, master, type)
        # register the task queue
        self.processer_queue = FIFOQueue(self.redis, self.config.get("processor_queue"))
        # register the storage backend
        self.storage_pipline = MongodbStorage(self.mongodb, self.config.get("storage_db"))

    def run(self, job):
        # register the document processor
        processor = DocumentProcessor()
        while len(self.processer_queue) > 0:
            self._update_on_job(True)
            page_id = self.processer_queue.pop()
            page = self.storage_pipline.find(self.config.get("page_table"), page_id)
            self.task = page
            terms = processor.process(page)
            # write the terms extracted from the page back into page_table
            page['terms'] = terms
            self.storage_pipline.update(self.config.get("page_table"), page_id, page)
            self._update_status(True)
            log("[SUCCESS] %s" % page['href'])
        else:
            # runs once the queue has been drained (or was empty to begin with)
            self.wait_task_time += 1
            self.task = None
            if self.wait_task_time > 5:
                self._update_on_job(False)
            log("[PROCESSOR] Wait for some jobs...")
            time.sleep(3)

    def task_stop(self):
        if self.task:
            # requeue the page id (the processor queue holds ids, not documents)
            self.processer_queue.push(self.task.get('_id'))
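# FIFOQueue appears throughout these workers but is not shown. The sketch
# below matches the push/pop/len usage above with a Redis list (redis-py);
# it follows the worker-side constructor form FIFOQueue(redis_client, key).
# The host/port/key form used in fetch()/collect() would simply create its
# own Redis connection. Storing tasks via str(), to pair with the workers'
# eval() of popped items, is an assumption.
class FIFOQueue(object):
    """First-in-first-out queue backed by a Redis list."""

    def __init__(self, redis_client, key):
        self.redis = redis_client
        self.key = key

    def push(self, value):
        # append to the tail of the list
        self.redis.rpush(self.key, str(value))

    def pop(self):
        # remove and return the head of the list
        return self.redis.lpop(self.key)

    def __len__(self):
        return self.redis.llen(self.key)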
def fetch(self):
    spider_queue = FIFOQueue(
        host=self.task_info["redis_host"],
        port=self.task_info["redis_port"],
        key=self.task_info["spider_queue_key"]
    )
    task_queue = FIFOQueue(
        host=self.task_info["redis_host"],
        port=self.task_info["redis_port"],
        key=self.task_info["parser_queue_key"]
    )
    crawler = SimpleCrawler(self.task_info["start_url"], self.task_info["allowed_domain"])
    dupefilter = SimpleDupefilter(
        host=self.task_info["redis_host"],
        port=self.task_info["redis_port"],
        key=self.task_info["spider_dupefilter_key"],
    )
    pipeline = MongodbPipeline(self.task_info["db_host"], self.task_info["db_port"], self.task_info["db_name"])

    spider_queue.push(self.task_info["start_url"])
    # TODO shutdown signal
    while True:
        if len(spider_queue) > 0:
            current_url = spider_queue.pop()
            crawler.fetch(current_url)
            # only parse and enqueue when the crawler fetched the content successfully
            if crawler.success:
                item = crawler.parse()
                next_urls = item.get("links")
                next_urls_count = 0
                for next_url in next_urls:
                    if not dupefilter.exists(next_url):
                        spider_queue.push(next_url)
                        next_urls_count += 1
                # report fetch information
                print("Crawler fetched %s and got %d urls" % (current_url, next_urls_count))
                self.rpc_proxy.server.message(self.name, "Successfully fetched url %s." % current_url)
                item = pipeline.insert(item, self.task_info["spider_stored_table"])
                task_queue.push(item.get("_id"))
                self.rpc_proxy.server.message(
                    self.name, "Stored url %s with ID %s." % (current_url, item.get("mongo_id"))
                )
        else:
            print("Wait for tasks...")
            time.sleep(3)
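# SimpleDupefilter is constructed in fetch() above but not defined here. A
# minimal sketch backed by a Redis set via redis-py: the class name,
# constructor arguments and exists() come from the call sites; add() is an
# assumed companion method for marking URLs as seen.
import redis


class SimpleDupefilter(object):
    """Track already-seen URLs in a Redis set."""

    def __init__(self, host, port, key):
        self.redis = redis.StrictRedis(host=host, port=port)
        self.key = key

    def add(self, url):
        # SADD is idempotent: adding an existing member is a no-op
        self.redis.sadd(self.key, url)

    def exists(self, url):
        # SISMEMBER checks membership without modifying the set
        return self.redis.sismember(self.key, url)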
class SpiderWorker(Worker):

    def __init__(self, master='127.0.0.1:2181', type='spider'):
        """
        Spider worker.
        :param master: master node address
        :param type: worker type
        :return:
        """
        Worker.__init__(self, master, type)
        # register the task queues
        self.spider_queue = FIFOQueue(self.redis, self.config.get("spider_queue"))
        self.processer_queue = FIFOQueue(self.redis, self.config.get("processor_queue"))
        # register the duplicate filter
        self.duplicate_filter = DuplicateFilter(self.redis, self.config.get("duplicate_set"))
        # register the storage backend
        self.storage_pipline = MongodbStorage(self.mongodb, self.config.get("storage_db"))

    def run(self, job):
        """
        Execute one work cycle.
        :param job: job information
        :return:
        """
        # register the crawler and parser
        crawler = PhantomCrawler()
        parser = NormalParser(job)
        if len(self.spider_queue) > 0:
            task = eval(self.spider_queue.pop())
            self.task = task
            # drop the task once it has run out of retries
            if task['life'] == 0:
                return
            response = crawler.fetch(task['url'])
            # success, result = crawler.fetch(task['url'])
            # the crawler fetched the page successfully
            if response['status_code'] == 200:
                try:
                    item = parser.parse(task['url'], response['content'])
                    # random value used as a shard key for sharded writes
                    item['ram'] = random.random()
                    new_urls = item['links']
                    # deduplicate newly discovered links before queueing them
                    for new_url in new_urls:
                        if not self.duplicate_filter.exists(new_url):
                            self.spider_queue.push({
                                "url": new_url,
                                "life": 5
                            })
                    # persist the raw parse result for this url
                    item = self.storage_pipline.insert(self.config.get("page_table"), item)
                    self.processer_queue.push(item.get('_id'))
                    # update the task status
                    self._update_status(True)
                    log("[SUCCESS] %s." % task['url'])
                except Exception as e:
                    # requeue the failed url with one less life
                    self.spider_queue.push({
                        "url": task['url'],
                        "life": task['life'] - 1
                    })
                    log("[FAILED] %s %s" % (task['url'], e))
            else:
                # update the task status
                self._update_status(False)
                # requeue the failed url with one less life
                self.spider_queue.push({
                    "url": task['url'],
                    "life": task['life'] - 1
                })
                log("[FAILED] %s %s" % (task['url'], response['status_code']))
        else:
            # empty-queue back-off (assumed, mirroring ProcessorWorker/AsyncSpiderWorker)
            self.wait_task_time += 1
            self.task = None
            if self.wait_task_time > 5:
                self._update_on_job(False)
            log("[SPIDER] Wait for some jobs...")
            time.sleep(3)
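# A hedged usage sketch for SpiderWorker: seed the shared Redis queue with the
# entry URL as a task dict (the {"url": ..., "life": ...} shape comes from
# run() above), then invoke run(). The start URL and the empty job payload are
# illustrative assumptions; in the real system the master node presumably
# dispatches jobs to workers.
worker = SpiderWorker(master='127.0.0.1:2181', type='spider')
worker.spider_queue.push({
    "url": "http://example.com/",  # hypothetical start URL
    "life": 5
})
job = {}  # job/rule payload consumed by NormalParser (assumed shape)
worker.run(job)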
class AsyncSpiderWorker(Worker):

    def __init__(self, master='127.0.0.1:2181', type='spider', concurrency=5, **kwargs):
        """
        Asynchronous spider worker.
        :param master: master node address
        :param type: worker type
        :param concurrency: concurrency level
        :param kwargs:
        :return:
        """
        Worker.__init__(self, master, type)
        # register the task queues
        self.spider_queue = FIFOQueue(self.redis, self.config.get("spider_queue"))
        self.processer_queue = FIFOQueue(self.redis, self.config.get("processor_queue"))
        # register the duplicate filter
        self.duplicate_filter = DuplicateFilter(self.redis, self.config.get("duplicate_set"))
        # register the storage backend
        self.storage_pipline = MongodbStorage(self.mongodb, self.config.get("storage_db"))
        # concurrency (number of worker coroutines)
        self.concurrency = concurrency
        # internal in-process queue
        self._queue = queues.Queue()

    def fetch(self, url, **kwargs):
        fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
        return fetch(url, raise_error=False, **kwargs)

    def parse(self, url, html):
        """
        Parse an HTML page.
        """
        parser = NormalParser(url, html, self.job)
        item = parser.parse()
        # random value used as a shard key for sharded writes
        item['ram'] = random.random()
        new_urls = item.get('links')
        # deduplicate newly discovered links before queueing them
        for new_url in new_urls:
            if not self.duplicate_filter.exists(new_url):
                self.spider_queue.push({
                    "url": new_url,
                    "life": 5
                })
        # persist the raw parse result for this url
        item = self.storage_pipline.insert(self.config.get("page_table"), item)
        self.processer_queue.push(item.get('_id'))
        self._update_status(True)
        log("[SUCCESS] %s." % url)

    def handle_response(self, task, response):
        """
        Handle the asynchronous response.
        """
        if response.code == 200:
            self.parse(task['url'], response.body)
        else:
            self._update_status(False)
            # requeue the failed url with one less life
            self.spider_queue.push({
                "url": task['url'],
                "life": task['life'] - 1
            })
            log("[FAILED] %s %s" % (task['url'], response.code))

    @gen.coroutine
    def get_page(self, task):
        """
        Fetch the page content.
        :param task:
        :return:
        """
        try:
            response = yield self.fetch(task['url'])
        except Exception as e:
            print('Exception: %s %s' % (e, task['url']))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):

        @gen.coroutine
        def fetch_url():
            current_task = yield self._queue.get()
            try:
                response = yield self.get_page(current_task)
                self.handle_response(current_task, response)
                # pull up to $(concurrency) tasks from the distributed queue
                # into the internal queue
                for i in range(self.concurrency):
                    if len(self.spider_queue) > 0:
                        task = eval(self.spider_queue.pop())
                        if task['life'] > 0:
                            yield self._queue.put(task)
            finally:
                self._queue.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        if len(self.spider_queue) > 0:
            self._update_on_job(True)
            # seed the internal queue with the first task
            self._queue.put(eval(self.spider_queue.pop()))
            # start workers; they run until the internal queue is drained
            for _ in range(self.concurrency):
                worker()
            yield self._queue.join(timeout=timedelta(seconds=300000))
        else:
            self.wait_task_time += 1
            if self.wait_task_time > 5:
                self._update_on_job(False)
            log("[SPIDER] Wait for some jobs...")
            time.sleep(3)

    def run(self, job):
        self.job = job  # keep the job for NormalParser in parse() (assumed; not set elsewhere in this excerpt)
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)
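# The _run() coroutine above follows Tornado's queue-based producer/consumer
# pattern: an in-process tornado.queues.Queue is seeded, a fixed number of
# worker coroutines drain it, and queue.join() resolves once every task is
# done. The self-contained sketch below isolates just that pattern
# (gen.sleep stands in for the actual fetch/parse work).
from datetime import timedelta

from tornado import gen, ioloop, queues


@gen.coroutine
def crawl_pattern_demo():
    q = queues.Queue()
    concurrency = 3

    @gen.coroutine
    def handle(item):
        # placeholder for get_page()/handle_response()
        yield gen.sleep(0.1)
        print("handled %s" % item)

    @gen.coroutine
    def worker():
        while True:
            item = yield q.get()
            try:
                yield handle(item)
            finally:
                q.task_done()

    # seed the internal queue
    for item in range(10):
        yield q.put(item)
    # start the consumers; join() returns once the queue is drained
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=60))


if __name__ == "__main__":
    ioloop.IOLoop.current().run_sync(crawl_pattern_demo)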