コード例 #1
0
ファイル: executors.py プロジェクト: JetMuffin/jetspider
    def collect(self):
        """Poll the parser queue forever and segment each queued page.

        Connection settings are read from ``self.task_info``: the queue is
        built from ``redis_host`` / ``redis_port`` / ``parser_queue_key`` and
        the storage pipeline from ``db_host`` / ``db_port`` / ``db_name``.

        For every page id popped from the queue, the stored item is looked up
        in the ``spider_stored_table`` table, its ``content`` field is
        segmented by ``JiebaParser``, the resulting terms are written back
        under the ``terms`` key, and a status line is sent to the master
        through ``self.rpc_proxy`` and echoed to stdout.

        This method never returns; when the queue is empty it sleeps for
        three seconds before polling again.
        """
        queue = FIFOQueue(
            self.task_info["redis_host"], self.task_info["redis_port"], self.task_info["parser_queue_key"]
        )
        pipeline = MongodbPipeline(self.task_info["db_host"], self.task_info["db_port"], self.task_info["db_name"])
        parser = JiebaParser()

        # TODO shutdown signal
        while True:
            if len(queue) > 0:
                page_id = queue.pop()
                table = self.task_info["spider_stored_table"]
                # NOTE(review): assumes find() always returns a stored item
                # for a queued page id -- confirm against MongodbPipeline.
                item = pipeline.find(table, page_id)
                terms = parser.segment(item["content"])

                # update item information to db
                item["terms"] = terms
                pipeline.update(table, page_id, item)

                # Build the status line once; it is both reported and printed.
                message = "Parse page[%s] and get %d terms" % (page_id, len(terms))

                # connect to master
                self.rpc_proxy.server.message(self.name, message)
                print(message)

            else:
                # Parenthesized single-argument print is valid (and prints the
                # same text) as a Python 2 statement and a Python 3 call.
                print("Wait for tasks...")
                time.sleep(3)
コード例 #2
0
ファイル: executors.py プロジェクト: zion302/jetspider
    def collect(self):
        """Consume page ids from the parser queue and store their terms.

        Reads its configuration from ``self.task_info``:

        * ``redis_host``, ``redis_port``, ``parser_queue_key`` -- passed to
          ``FIFOQueue`` to open the work queue.
        * ``db_host``, ``db_port``, ``db_name`` -- passed to
          ``MongodbPipeline`` to open the storage pipeline.
        * ``spider_stored_table`` -- table the page items are read from and
          written back to.

        Each popped page id is resolved to its stored item, the item's
        ``content`` is segmented with ``JiebaParser``, the terms are saved
        under ``item['terms']``, and a progress message is sent to the master
        via ``self.rpc_proxy`` and printed.

        The loop runs indefinitely; while the queue is empty it waits three
        seconds between polls.
        """
        queue = FIFOQueue(self.task_info['redis_host'],
                          self.task_info['redis_port'],
                          self.task_info['parser_queue_key'])
        pipeline = MongodbPipeline(self.task_info['db_host'],
                                   self.task_info['db_port'],
                                   self.task_info['db_name'])
        parser = JiebaParser()

        # TODO shutdown signal
        while True:
            if len(queue) == 0:
                # Nothing queued: back off before polling again.  The
                # parenthesized form keeps this line valid on Python 3 while
                # printing the same text on Python 2.
                print("Wait for tasks...")
                time.sleep(3)
                continue

            page_id = queue.pop()
            table = self.task_info['spider_stored_table']
            # NOTE(review): presumably find() returns a dict-like item with a
            # 'content' key for every queued id -- verify against the pipeline.
            item = pipeline.find(table, page_id)
            terms = parser.segment(item['content'])

            # update item information to db
            item['terms'] = terms
            pipeline.update(table, page_id, item)

            # Format the progress line once and reuse it for RPC and stdout.
            message = "Parse page[%s] and get %d terms" % (page_id,
                                                           len(terms))

            # connect to master
            self.rpc_proxy.server.message(self.name, message)
            print(message)