Code Example #1
File: executors.py Project: zion302/jetspider
    def fetch(self):
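        # Set up Redis-backed work queues, the crawler, a duplicate filter,
        # and the MongoDB storage pipeline from the task configuration.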
        spider_queue = FIFOQueue(host=self.task_info['redis_host'],
                                 port=self.task_info['redis_port'],
                                 key=self.task_info['spider_queue_key'])
        task_queue = FIFOQueue(host=self.task_info['redis_host'],
                               port=self.task_info['redis_port'],
                               key=self.task_info['parser_queue_key'])
        crawler = SimpleCrawler(self.task_info['start_url'],
                                self.task_info['allowed_domain'])
        dupefilter = SimpleDupefilter(
            host=self.task_info['redis_host'],
            port=self.task_info['redis_port'],
            key=self.task_info['spider_dupefilter_key'])
        pipeline = MongodbPipeline(self.task_info['db_host'],
                                   self.task_info['db_port'],
                                   self.task_info['db_name'])
        spider_queue.push(self.task_info['start_url'])

        # TODO shutdown signal
        while True:
            if len(spider_queue) > 0:
                current_url = spider_queue.pop()
                crawler.fetch(current_url)

                # if the crawler successfully fetched the content
                if crawler.success:
                    item = crawler.parse()
                    next_urls = item.get('links')
                    next_urls_count = 0
                    for next_url in next_urls:
                        if not dupefilter.exists(next_url):
                            spider_queue.push(next_url)
                            next_urls_count += 1

                    # print fetch information
                    print "Crawler fetched %s and get %d urls" % (
                        current_url, next_urls_count)
                    self.rpc_proxy.server.message(
                        self.name, "Success fetched url %s." % current_url)

                    item = pipeline.insert(
                        item, self.task_info['spider_stored_table'])
                    task_queue.push(item.get('_id'))
                    self.rpc_proxy.server.message(
                        self.name, "Stored url %s with ID %s." %
                        (current_url, item.get('mongo_id')))

            else:
                print "Wait for tasks..."
                time.sleep(3)
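
The loop above relies on helpers (FIFOQueue, SimpleDupefilter, SimpleCrawler, MongodbPipeline) that are defined elsewhere in the project and are not shown on this page. As a rough idea of what the Redis-backed pieces could look like, here is a minimal sketch assuming the redis-py client; only the class and method names used by the example come from the original, everything else is an assumption rather than the project's actual implementation.

import redis


class FIFOQueue(object):
    """Sketch of a Redis-list-backed FIFO queue (not jetspider's actual code)."""

    def __init__(self, host, port, key):
        self.server = redis.StrictRedis(host=host, port=port)
        self.key = key

    def __len__(self):
        # Number of pending entries, used by the loop's len(spider_queue) check.
        return self.server.llen(self.key)

    def push(self, value):
        # Enqueue at the tail of the Redis list.
        self.server.rpush(self.key, value)

    def pop(self):
        # Dequeue from the head of the list (FIFO order).
        return self.server.lpop(self.key)


class SimpleDupefilter(object):
    """Sketch of a Redis-set-based duplicate filter (not jetspider's actual code)."""

    def __init__(self, host, port, key):
        self.server = redis.StrictRedis(host=host, port=port)
        self.key = key

    def exists(self, url):
        # Assumption: checking also records the URL, so the loop's
        # "if not dupefilter.exists(next_url)" both tests and marks it.
        # SADD returns 0 when the member was already in the set.
        return self.server.sadd(self.key, url) == 0

With classes along these lines, the fetch loop keeps its crawl frontier and its seen-URL set in Redis, so several executors could share the same frontier by pointing at the same keys.
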
Code Example #2
File: executors.py Project: JetMuffin/jetspider
    def fetch(self):
        spider_queue = FIFOQueue(
            host=self.task_info["redis_host"], port=self.task_info["redis_port"], key=self.task_info["spider_queue_key"]
        )
        task_queue = FIFOQueue(
            host=self.task_info["redis_host"], port=self.task_info["redis_port"], key=self.task_info["parser_queue_key"]
        )
        crawler = SimpleCrawler(self.task_info["start_url"], self.task_info["allowed_domain"])
        dupefilter = SimpleDupefilter(
            host=self.task_info["redis_host"],
            port=self.task_info["redis_port"],
            key=self.task_info["spider_dupefilter_key"],
        )
        pipeline = MongodbPipeline(self.task_info["db_host"], self.task_info["db_port"], self.task_info["db_name"])
        spider_queue.push(self.task_info["start_url"])

        # TODO shutdown signal
        while True:
            if len(spider_queue) > 0:
                current_url = spider_queue.pop()
                crawler.fetch(current_url)

                # if the crawler successfully fetched the content
                if crawler.success:
                    item = crawler.parse()
                    next_urls = item.get("links")
                    next_urls_count = 0
                    for next_url in next_urls:
                        if not dupefilter.exists(next_url):
                            spider_queue.push(next_url)
                            next_urls_count += 1

                    # print fetch information
                    print "Crawler fetched %s and get %d urls" % (current_url, next_urls_count)
                    self.rpc_proxy.server.message(self.name, "Success fetched url %s." % current_url)

                    item = pipeline.insert(item, self.task_info["spider_stored_table"])
                    task_queue.push(item.get("_id"))
                    self.rpc_proxy.server.message(
                        self.name, "Stored url %s with ID %s." % (current_url, item.get("mongo_id"))
                    )

            else:
                print "Wait for tasks..."
                time.sleep(3)
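
For the storage side, pipeline.insert is expected to persist the parsed item and hand back an identifier that the loop can push onto the parser queue. A minimal sketch of such a pipeline, assuming pymongo (an illustration, not jetspider's actual MongodbPipeline):

from pymongo import MongoClient


class MongodbPipeline(object):
    """Sketch of a MongoDB storage pipeline (not jetspider's actual code)."""

    def __init__(self, host, port, db_name):
        self.client = MongoClient(host, port)
        self.db = self.client[db_name]

    def insert(self, item, table):
        # insert_one adds the generated '_id' to the item dict in place;
        # the fetch loop then pushes that id onto the parser queue so a
        # downstream parser can load the stored document by id.
        self.db[table].insert_one(item)
        return item

Under this assumption the '_id' pushed to task_queue is a MongoDB ObjectId; a real queue implementation would likely serialize it (for example with str()) before handing it to Redis.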