Example 1
    def parse(self, url, html):
        """ Parse the fetched HTML page. """
        parser = NormalParser(url, html, self.job)
        item = parser.parse()
        # Random value used as a shard key for partitioned writes
        item['ram'] = random.random()
        new_urls = item.get('links', [])

        # Deduplicate newly discovered links before queueing them
        for new_url in new_urls:
            if not self.duplicate_filter.exists(new_url):
                self.spider_queue.push({
                    "url": new_url,
                    "life": 5
                })

        # Persist the raw parse result for this URL
        item = self.storage_pipline.insert(self.config.get("page_table"), item)
        self.processer_queue.push(item.get('_id'))
        self._update_status(True)
        log("[SUCCESS] %s." % url)
Example 2
    def parse(self, url, html):
        """ Delegate parsing of the fetched page to NormalParser. """
        parser = NormalParser(url, html, self.job)
        return parser.parse()
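
Both parse variants delegate to NormalParser, which these snippets assume returns a dict carrying at least a 'links' key. A minimal, hypothetical stub with that shape (NormalParserStub and LinkCollector are invented names; the real parser is part of the project and extracts far more than anchor hrefs):

# Hypothetical stub matching the NormalParser interface assumed above.
from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    """Collect href values from <a> tags."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)

class NormalParserStub(object):
    def __init__(self, url, html, job):
        self.url = url
        self.html = html
        self.job = job

    def parse(self):
        collector = LinkCollector()
        collector.feed(self.html)
        # The real item also carries page content, title, etc.
        return {"url": self.url, "links": collector.links}

item = NormalParserStub("http://example.com", '<a href="/next">next</a>', job=None).parse()
print(item["links"])  # ['/next']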
Example 3
    def run(self, job):
        """
        Execute one crawl step.
        :param job: task information
        :return:
        """
        # Set up the crawler and parser
        crawler = PhantomCrawler()
        parser = NormalParser(job)

        if len(self.spider_queue) > 0:
            # Queue entries are serialized dict literals; ast.literal_eval
            # (requires "import ast") is a safer parse than eval
            task = ast.literal_eval(self.spider_queue.pop())
            self.task = task

            # If the task has failed too many times, stop retrying it
            if task['life'] == 0:
                return

            response = crawler.fetch(task['url'])

            # The crawler fetched the page successfully
            if response['status_code'] == 200:
                try:
                    item = parser.parse(task['url'], response['content'])
                    # Random value used as a shard key for partitioned writes
                    item['ram'] = random.random()
                    new_urls = item['links']

                    # Deduplicate newly discovered links before queueing them
                    for new_url in new_urls:
                        if not self.duplicate_filter.exists(new_url):
                            self.spider_queue.push({
                                "url": new_url,
                                "life": 5
                            })

                    # Persist the raw parse result for this URL
                    item = self.storage_pipline.insert(self.config.get("page_table"), item)
                    self.processer_queue.push(item.get('_id'))

                    # Update the task status
                    self._update_status(True)
                    log("[SUCCESS] %s." % task['url'])
                except Exception as e:
                    # Put the failed URL back on the queue with one less life
                    self.spider_queue.push({
                        "url": task['url'],
                        "life": task['life'] - 1
                    })
                    log("[FAILED] %s %s" % (task['url'], e))
            else:
                # Update the task status
                self._update_status(False)

                # Put the failed URL back on the queue with one less life
                self.spider_queue.push({
                    "url": task['url'],
                    "life": task['life'] - 1
                })
                log("[FAILED] %s %s" % (task['url'], response['status_code']))