Example #1
    def collect(self):
        queue = FIFOQueue(self.task_info['redis_host'],
                          self.task_info['redis_port'],
                          self.task_info['parser_queue_key'])
        pipeline = MongodbPipeline(self.task_info['db_host'],
                                   self.task_info['db_port'],
                                   self.task_info['db_name'])
        parser = JiebaParser()

        # TODO shutdown signal
        while True:
            if len(queue) > 0:
                page_id = queue.pop()
                item = pipeline.find(self.task_info['spider_stored_table'],
                                     page_id)
                terms = parser.segment(item['content'])
                terms_count = len(terms)

                # update the item record in the database with the extracted terms
                item['terms'] = terms
                pipeline.update(self.task_info["spider_stored_table"], page_id,
                                item)

                # report progress to the master over RPC
                self.rpc_proxy.server.message(
                    self.name,
                    "Parse page[%s] and get %d terms" % (page_id, terms_count))
                print("Parse page[%s] and get %d terms" %
                      (page_id, terms_count))

            else:
                print "Wait for tasks..."
                time.sleep(3)
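The snippet above assumes a small FIFOQueue helper backed by a Redis list that supports len(), push() and pop(). A minimal sketch of what such a wrapper might look like with redis-py; the constructor signature (host, port, key) follows Example #1 and is an assumption, not the project's actual implementation:

import redis

class FIFOQueue(object):
    """Hypothetical FIFO queue on top of a Redis list (sketch, not the project's code)."""

    def __init__(self, host, port, key):
        self._redis = redis.StrictRedis(host=host, port=port)
        self._key = key

    def __len__(self):
        return self._redis.llen(self._key)

    def push(self, value):
        # append to the tail of the list
        self._redis.rpush(self._key, value)

    def pop(self):
        # remove and return the head of the list (FIFO order)
        return self._redis.lpop(self._key)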
Example #2
    def collect(self):
        queue = FIFOQueue(
            self.task_info["redis_host"], self.task_info["redis_port"], self.task_info["parser_queue_key"]
        )
        pipeline = MongodbPipeline(self.task_info["db_host"], self.task_info["db_port"], self.task_info["db_name"])
        parser = JiebaParser()

        # TODO shutdown signal
        while True:
            if len(queue) > 0:
                page_id = queue.pop()
                item = pipeline.find(self.task_info["spider_stored_table"], page_id)
                terms = parser.segment(item["content"])
                terms_count = len(terms)

                # update the item record in the database with the extracted terms
                item["terms"] = terms
                pipeline.update(self.task_info["spider_stored_table"], page_id, item)

                # report progress to the master over RPC
                self.rpc_proxy.server.message(self.name, "Parse page[%s] and get %d terms" % (page_id, terms_count))
                print("Parse page[%s] and get %d terms" % (page_id, terms_count))

            else:
                print "Wait for tasks..."
                time.sleep(3)
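collect() also relies on a MongodbPipeline exposing find(), update() and insert(). A rough pymongo-based sketch of that interface, under the assumption that records are keyed by their MongoDB _id (the real class may convert the string id popped from Redis into an ObjectId, which is glossed over here):

from pymongo import MongoClient

class MongodbPipeline(object):
    """Hypothetical MongoDB persistence helper (sketch; the real class may differ)."""

    def __init__(self, host, port, db_name):
        self._db = MongoClient(host, port)[db_name]

    def insert(self, item, table):
        # insert_one mutates item in place, adding the generated '_id'
        self._db[table].insert_one(item)
        return item

    def find(self, table, page_id):
        # page_id is assumed to already be a valid _id value
        return self._db[table].find_one({"_id": page_id})

    def update(self, table, page_id, item):
        self._db[table].replace_one({"_id": page_id}, item)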
Example #3
    def __init__(self, master='127.0.0.1:2181', type='spider'):
        """
        Spider-type worker
        :param master: master node address
        :param type: worker type
        :return:
        """
        Worker.__init__(self, master, type)
        # register the task queues
        self.spider_queue = FIFOQueue(self.redis, self.config.get("spider_queue"))
        self.processer_queue = FIFOQueue(self.redis, self.config.get("processor_queue"))
        # register the duplicate filter
        self.duplicate_filter = DuplicateFilter(self.redis, self.config.get("duplicate_set"))
        # register the storage database
        self.storage_pipline = MongodbStorage(self.mongodb, self.config.get("storage_db"))
Example #4
class ProcessorWorker(Worker):
    def __init__(self, master='127.0.0.1:2181', type='spider'):
        """
        处理执行器
        :param master: 主节点地址
        :param type: 执行器类型
        :return:
        """
        Worker.__init__(self, master, type)

        # 注册任务队列
        self.processer_queue = FIFOQueue(self.redis, self.config.get("processor_queue"))
        # 注册存储数据库
        self.storage_pipline = MongodbStorage(self.mongodb, self.config.get("storage_db"))

    def run(self, job):
        # register the document processor
        processor = DocumentProcessor()

        while len(self.processer_queue) > 0:
            self._update_on_job(True)
            page_id = self.processer_queue.pop()
            page = self.storage_pipline.find(self.config.get("page_table"), page_id)
            self.task = page
            terms = processor.process(page)

            # write the terms extracted from the page back into page_table
            page['terms'] = terms
            self.storage_pipline.update(self.config.get("page_table"), page_id, page)

            self._update_status(True)
            log("[SUCCESS] %s" % page['href'])

        else:
            self.wait_task_time += 1
            self.task = None
            if self.wait_task_time > 5:
                self._update_on_job(False)
            log("[PROCESSOR] Wait for some jobs...")
            time.sleep(3)

    def task_stop(self):
        if self.task:
            self.processer_queue.push(self.task)
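ProcessorWorker only requires that DocumentProcessor expose a process(page) method returning a list of terms. Since Example #1 segments content with JiebaParser, a purely illustrative sketch built on jieba might look like this; the 'content' field name is borrowed from the other examples and the real processor likely does more cleaning and filtering:

import jieba

class DocumentProcessor(object):
    """Hypothetical term extractor (sketch only)."""

    def process(self, page):
        # segment the stored page content into terms with jieba's default mode
        return [term for term in jieba.cut(page.get('content', '')) if term.strip()]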
Example #5
    def __init__(self, master='127.0.0.1:2181', type='spider', concurrency=5, **kwargs):
        """
        Asynchronous spider worker
        :param master: master node address
        :param type: worker type
        :param concurrency: concurrency level
        :param kwargs:
        :return:
        """
        Worker.__init__(self, master, type)
        # register the task queues
        self.spider_queue = FIFOQueue(self.redis, self.config.get("spider_queue"))
        self.processer_queue = FIFOQueue(self.redis, self.config.get("processor_queue"))
        # register the duplicate filter
        self.duplicate_filter = DuplicateFilter(self.redis, self.config.get("duplicate_set"))
        # register the storage database
        self.storage_pipline = MongodbStorage(self.mongodb, self.config.get("storage_db"))
        # number of concurrent worker coroutines
        self.concurrency = concurrency
        # internal (in-process) task queue
        self._queue = queues.Queue()
Example #6
    def fetch(self):
        spider_queue = FIFOQueue(
            host=self.task_info["redis_host"], port=self.task_info["redis_port"], key=self.task_info["spider_queue_key"]
        )
        task_queue = FIFOQueue(
            host=self.task_info["redis_host"], port=self.task_info["redis_port"], key=self.task_info["parser_queue_key"]
        )
        crawler = SimpleCrawler(self.task_info["start_url"], self.task_info["allowed_domain"])
        dupefilter = SimpleDupefilter(
            host=self.task_info["redis_host"],
            port=self.task_info["redis_port"],
            key=self.task_info["spider_dupefilter_key"],
        )
        pipeline = MongodbPipeline(self.task_info["db_host"], self.task_info["db_port"], self.task_info["db_name"])
        spider_queue.push(self.task_info["start_url"])

        # TODO shutdown signal
        while True:
            if len(spider_queue) > 0:
                current_url = spider_queue.pop()
                crawler.fetch(current_url)

                # if the crawler successfully fetched the content
                if crawler.success:
                    item = crawler.parse()
                    next_urls = item.get("links")
                    next_urls_count = 0
                    for next_url in next_urls:
                        if not dupefilter.exists(next_url):
                            spider_queue.push(next_url)
                            next_urls_count += 1

                    # print fetch information
                    print("Crawler fetched %s and get %d urls" % (current_url, next_urls_count))
                    self.rpc_proxy.server.message(self.name, "Success fetched url %s." % current_url)

                    item = pipeline.insert(item, self.task_info["spider_stored_table"])
                    task_queue.push(item.get("_id"))
                    self.rpc_proxy.server.message(
                        self.name, "Stored url %s with ID %s." % (current_url, item.get("mongo_id"))
                    )

            else:
                print "Wait for tasks..."
                time.sleep(3)
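fetch() additionally depends on SimpleDupefilter, of which only exists() is visible here. A hedged sketch backed by a Redis set; combining the membership check with recording the URL as seen is an assumption, since the snippet never shows URLs being marked as visited separately:

import redis

class SimpleDupefilter(object):
    """Hypothetical duplicate filter on a Redis set (sketch only)."""

    def __init__(self, host, port, key):
        self._redis = redis.StrictRedis(host=host, port=port)
        self._key = key

    def exists(self, url):
        # SADD returns 1 if the member was new and 0 if it was already present,
        # so this both checks and records the URL in one round trip (an assumed
        # behaviour; the real class might separate the two operations)
        return self._redis.sadd(self._key, url) == 0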
Example #7
class SpiderWorker(Worker):
    def __init__(self, master='127.0.0.1:2181', type='spider'):
        """
        spider类型执行器
        :param master: 主节点地址
        :param type: 执行器类型
        :return:
        """
        Worker.__init__(self, master, type)
        # 注册任务队列
        self.spider_queue = FIFOQueue(self.redis, self.config.get("spider_queue"))
        self.processer_queue = FIFOQueue(self.redis, self.config.get("processor_queue"))
        # 注册过滤器
        self.duplicate_filter = DuplicateFilter(self.redis, self.config.get("duplicate_set"))
        # 注册存储数据库
        self.storage_pipline = MongodbStorage(self.mongodb, self.config.get("storage_db"))

    def run(self, job):
        """
        Run method
        :param job: job (task) information
        :return:
        """
        # register the crawler and the parser
        crawler = PhantomCrawler()
        parser = NormalParser(job)

        if len(self.spider_queue) > 0:
            task = eval(self.spider_queue.pop())
            self.task = task

            # skip the task if it has already failed too many times
            if task['life'] == 0:
                return

            response = crawler.fetch(task['url'])
            # success, result = crawler.fetch(task['url'])

            # if the crawl succeeded
            if response['status_code'] == 200:
                try:
                    item = parser.parse(task['url'], response['content'])
                    # random value used for sharded writes
                    item['ram'] = random.random()
                    new_urls = item['links']

                    # dedupe the newly extracted links and push them onto the queue
                    for new_url in new_urls:
                        if not self.duplicate_filter.exists(new_url):
                            self.spider_queue.push({
                                "url": new_url,
                                "life": 5
                            })

                    # persist the raw parse result for the url
                    item = self.storage_pipline.insert(self.config.get("page_table"), item)
                    self.processer_queue.push(item.get('_id'))

                    # update the task status
                    self._update_status(True)
                    log("[SUCCESS] %s." % task['url'])
                except Exception as e:
                    # push the failed url back onto the queue
                    self.spider_queue.push({
                        "url": task['url'],
                        "life": task['life'] - 1
                    })
                    log("[FAILED] %s %s" % (task['url'], e))
            else:
                # update the task status
                self._update_status(False)

                # push the failed url back onto the queue
                self.spider_queue.push({
                    "url": task['url'],
                    "life": task['life'] - 1
                })
                log("[FAILED] %s %s" % (task['url'], response['status_code']))

        else:
            # idle branch (the source excerpt is cut off here; the body below is an
            # assumption that mirrors ProcessorWorker and AsyncSpiderWorker)
            self.wait_task_time += 1
            self.task = None
            if self.wait_task_time > 5:
                self._update_on_job(False)
            log("[SPIDER] Wait for some jobs...")
            time.sleep(3)
Example #8
class AsyncSpiderWorker(Worker):
    def __init__(self, master='127.0.0.1:2181', type='spider', concurrency=5, **kwargs):
        """
        异步爬虫执行器
        :param master: 主节点地址
        :param type: 执行器类型
        :param concurrency: 并发数
        :param kwargs:
        :return:
        """
        Worker.__init__(self, master, type)
        # 注册任务队列
        self.spider_queue = FIFOQueue(self.redis, self.config.get("spider_queue"))
        self.processer_queue = FIFOQueue(self.redis, self.config.get("processor_queue"))
        # 注册过滤器
        self.duplicate_filter = DuplicateFilter(self.redis, self.config.get("duplicate_set"))
        # 注册存储数据库
        self.storage_pipline = MongodbStorage(self.mongodb, self.config.get("storage_db"))
        # 并发线程数
        self.concurrency = concurrency
        # 内部队列
        self._queue = queues.Queue()

    def fetch(self, url, **kwargs):
        fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
        return fetch(url, raise_error=False, **kwargs)

    def parse(self, url, html):
        """ 解析html页面 """
        parser = NormalParser(url, html, self.job)
        item = parser.parse()
        # random value used for sharded writes
        item['ram'] = random.random()
        new_urls = item.get('links')

        # dedupe the newly extracted links and push them onto the queue
        for new_url in new_urls:
            if not self.duplicate_filter.exists(new_url):
                self.spider_queue.push({
                    "url": new_url,
                    "life": 5
                })

        # persist the raw parse result for the url
        item = self.storage_pipline.insert(self.config.get("page_table"), item)
        self.processer_queue.push(item.get('_id'))
        self._update_status(True)
        log("[SUCCESS] %s." % url)

    def handle_response(self, task, response):
        """ 处理异步返回 """
        if response.code == 200:
            self.parse(task['url'], response.body)

        else:
            self._update_status(False)
            self.spider_queue.push({
                "url": task['url'],
                "life": task['life'] - 1
            })
            log("[FAILED] %s %s" % (task['url'], response.code))

    @gen.coroutine
    def get_page(self, task):
        """
        Fetch the page content
        :param task:
        :return:
        """
        try:
            response = yield self.fetch(task['url'])
        except Exception as e:
            print('Exception: %s %s' % (e, task['url']))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):
        @gen.coroutine
        def fetch_url():
            current_task = yield self._queue.get()
            try:
                response = yield self.get_page(current_task)
                self.handle_response(current_task, response)

                # pull up to self.concurrency tasks from the distributed queue into the local queue
                for i in range(self.concurrency):
                    if len(self.spider_queue) > 0:
                        task = eval(self.spider_queue.pop())
                        if task['life'] > 0:
                            yield self._queue.put(task)

            finally:
                self._queue.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        if len(self.spider_queue) > 0:
            self._update_on_job(True)
            # seed the local queue with the first task
            self._queue.put(eval(self.spider_queue.pop()))

            # start workers that run until the queue is drained
            for _ in range(self.concurrency):
                worker()

            yield self._queue.join(timeout=timedelta(seconds=300000))
        else:
            self.wait_task_time += 1
            if self.wait_task_time > 5:
                self._update_on_job(False)
            log("[SPIDER] Wait for some jobs...")
            time.sleep(3)

    def run(self, job):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)
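For context, _run() follows Tornado's standard queue-based worker-pool idiom: a local tornado.queues.Queue drained by a fixed number of coroutines. A stripped-down, standalone version of the same shape, with a placeholder seed URL and none of the Redis/Mongo plumbing (the URL and the concurrency value of 5 are illustrative assumptions):

from datetime import timedelta

from tornado import gen, httpclient, ioloop, queues


@gen.coroutine
def main():
    q = queues.Queue()
    yield q.put("https://example.com/")  # placeholder seed URL (illustrative only)

    @gen.coroutine
    def worker():
        while True:
            url = yield q.get()
            try:
                # raise_error=False returns responses with error codes instead of raising
                response = yield httpclient.AsyncHTTPClient().fetch(url, raise_error=False)
                print("%s -> %s" % (url, response.code))
            finally:
                q.task_done()

    # start a fixed pool of worker coroutines, as AsyncSpiderWorker does with self.concurrency
    for _ in range(5):
        worker()

    # wait until every queued task has been marked done (bounded by a timeout)
    yield q.join(timeout=timedelta(seconds=60))


ioloop.IOLoop.current().run_sync(main)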
Example #9
    def fetch(self):
        spider_queue = FIFOQueue(host=self.task_info['redis_host'],
                                 port=self.task_info['redis_port'],
                                 key=self.task_info['spider_queue_key'])
        task_queue = FIFOQueue(host=self.task_info['redis_host'],
                               port=self.task_info['redis_port'],
                               key=self.task_info['parser_queue_key'])
        crawler = SimpleCrawler(self.task_info['start_url'],
                                self.task_info['allowed_domain'])
        dupefilter = SimpleDupefilter(
            host=self.task_info['redis_host'],
            port=self.task_info['redis_port'],
            key=self.task_info['spider_dupefilter_key'])
        pipeline = MongodbPipeline(self.task_info['db_host'],
                                   self.task_info['db_port'],
                                   self.task_info['db_name'])
        spider_queue.push(self.task_info['start_url'])

        # TODO shutdown signal
        while True:
            if len(spider_queue) > 0:
                current_url = spider_queue.pop()
                crawler.fetch(current_url)

                # if the crawler successfully fetched the content
                if crawler.success:
                    item = crawler.parse()
                    next_urls = item.get('links')
                    next_urls_count = 0
                    for next_url in next_urls:
                        if not dupefilter.exists(next_url):
                            spider_queue.push(next_url)
                            next_urls_count += 1

                    # print fetch information
                    print("Crawler fetched %s and get %d urls" % (
                        current_url, next_urls_count))
                    self.rpc_proxy.server.message(
                        self.name, "Success fetched url %s." % current_url)

                    item = pipeline.insert(
                        item, self.task_info['spider_stored_table'])
                    task_queue.push(item.get('_id'))
                    self.rpc_proxy.server.message(
                        self.name, "Stored url %s with ID %s." %
                        (current_url, item.get('mongo_id')))

            else:
                print "Wait for tasks..."
                time.sleep(3)