Beispiel #1
0
    def parse(self, response):
        """Parse a list page and yield the follow-up requests.

        On the first list page, the text selected by ``self.checkTxtXpath``
        is passed through ``toMd5`` and compared with ``self.last_md5``.
        When both values match, the page is treated as unchanged and the
        generator finishes without yielding anything.

        Otherwise this yields everything produced by
        ``self.getDetailPageUrls(response)``, then (unless ``self.isDone``
        is set) everything produced by ``self.getNextListPageUrl(response)``.
        On the first list page the fresh digest is finally handed to
        ``syncLastMd5`` together with ``self.rule_id``.

        ``self.isFirstListPage`` is cleared once the page has been handled.
        """
        last_md5 = ''
        if self.isFirstListPage:
            check_text = self.safeParse(response, self.checkTxtXpath)
            last_md5 = toMd5(check_text)

        logging.info(
            "*********last_md5 : %s   self.last_md5 : %s*****",
            last_md5, self.last_md5)

        if self.isFirstListPage and last_md5 == self.last_md5:
            # Unchanged first page: stop without emitting anything. A bare
            # ``return`` ends the generator cleanly; yielding an empty list
            # would hand a non-request object to whatever consumes this
            # callback (presumably the Scrapy engine, which rejects lists).
            self.isFirstListPage = False
            return

        for request in self.getDetailPageUrls(response):
            yield request

        # Follow the next list page. ``isDone`` is read only after the detail
        # requests above have been consumed, exactly as before.
        if not self.isDone:
            for request in self.getNextListPageUrl(response):
                yield request

        # Persist the new md5 so the next run can detect an unchanged page.
        if self.isFirstListPage:
            syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})

        self.isFirstListPage = False
Beispiel #2
0
    def parse(self, response):
        """Parse a list page and yield the follow-up requests.

        On the first list page, the text selected by ``self.checkTxtXpath``
        is passed through ``toMd5`` and compared with ``self.last_md5``.
        The page is treated as unchanged, and nothing is yielded, only when
        all of the following hold: ``self.is_duplicate`` is falsy,
        ``OPEN_MD5_CHECK`` is truthy, this is the first list page, and the
        two digests are equal.

        Otherwise this yields everything produced by
        ``self.getDetailPageUrls(response)``, then (unless ``self.isDone``
        is set) everything produced by ``self.getNextListPageUrl(response)``.
        On the first list page the fresh digest is finally handed to
        ``syncLastMd5`` together with ``self.rule_id``.

        ``self.isFirstListPage`` is cleared once the page has been handled.
        """
        last_md5 = ''
        if self.isFirstListPage:
            check_text = self.safeParse(response, self.checkTxtXpath)
            last_md5 = toMd5(check_text)

        logging.info(
            "*********last_md5 : %s   self.last_md5 : %s*****",
            last_md5, self.last_md5)

        # Operand order is kept so the short-circuit behaviour is unchanged.
        page_unchanged = (
            (not self.is_duplicate)
            and OPEN_MD5_CHECK
            and self.isFirstListPage
            and last_md5 == self.last_md5
        )
        if page_unchanged:
            # Stop without emitting anything. A bare ``return`` ends the
            # generator cleanly; yielding an empty list would hand a
            # non-request object to whatever consumes this callback
            # (presumably the Scrapy engine, which rejects lists).
            self.isFirstListPage = False
            return

        for request in self.getDetailPageUrls(response):
            yield request

        # Follow the next list page. ``isDone`` is read only after the detail
        # requests above have been consumed, exactly as before.
        if not self.isDone:
            for request in self.getNextListPageUrl(response):
                yield request

        # Persist the new md5 so the next run can detect an unchanged page.
        if self.isFirstListPage:
            syncLastMd5({'last_md5': last_md5, 'id': self.rule_id})

        self.isFirstListPage = False
Beispiel #3
0
    def run(self, config):
        """Fetch the configured RSS feed and process it unless unchanged.

        After ``initConfig(config)``, the first element of
        ``config['start_urls']`` is parsed with ``feedparser``. The feed
        entries are passed through ``toMd5`` and the result is compared with
        ``self.last_md5``.

        Returns ``True`` without further work when ``OPEN_MD5_CHECK`` is
        truthy and both digests are equal. Otherwise the feed is handed to
        ``self.parse``, the new digest is passed to ``syncLastMd5`` together
        with ``self.rule_id``, and the method returns ``None`` implicitly.
        """
        self.initConfig(config)

        start_urls = config.get('start_urls', '')
        feed = feedparser.parse(start_urls[0])

        # MD5 check
        entries_md5 = toMd5(feed.entries)
        message = "*********last_md5 : %s   self.last_md5 : %s*****" % (entries_md5, self.last_md5)
        logging.info(message)

        feed_unchanged = OPEN_MD5_CHECK and self.last_md5 == entries_md5
        if feed_unchanged:
            return True

        # Parse the RSS feed, then record the digest that was just processed.
        self.parse(feed)
        sync_payload = {'last_md5': entries_md5, 'id': self.rule_id}
        syncLastMd5(sync_payload)
Beispiel #4
0
    def run(self, config):
        """Run one RSS crawl: load the feed, skip it if unchanged, else parse.

        ``initConfig(config)`` is called first. The first element of
        ``config['start_urls']`` is then parsed with ``feedparser``, and
        ``toMd5`` of the feed entries is compared with ``self.last_md5``.

        When ``OPEN_MD5_CHECK`` is truthy and the digests are equal the
        method returns ``True`` and does nothing else. In every other case
        it calls ``self.parse`` on the feed, passes the new digest and
        ``self.rule_id`` to ``syncLastMd5``, and returns ``None``.
        """
        self.initConfig(config)

        feed_url = config.get('start_urls', '')[0]
        parsed_feed = feedparser.parse(feed_url)

        # MD5 check
        digest = toMd5(parsed_feed.entries)
        log_line = "*********last_md5 : %s   self.last_md5 : %s*****" % (
            digest, self.last_md5)
        logging.info(log_line)

        if not (OPEN_MD5_CHECK and self.last_md5 == digest):
            # The feed changed (or the check is disabled): parse the RSS
            # feed, then store the digest that was just processed.
            self.parse(parsed_feed)
            syncLastMd5({'last_md5': digest, 'id': self.rule_id})
            return None

        return True