Ejemplo n.º 1
0
    def parse_node(self, response):
        """Parse one fetched feed page and emit an item if its content changed.

        Compares the md5 of the configured check-text against the md5 stored
        in the crawl config; when unchanged, emits nothing. Otherwise builds
        an ``XmlFeedItem``, persists the new md5, and schedules the next
        pending crawl (if any) back through this same callback.

        :param response: Scrapy response carrying ``meta['spiderConfig']``.
        :yields: at most one ``XmlFeedItem`` and at most one follow-up ``Request``.
        """
        self.currentNode = response
        self.initConfig(response.meta['spiderConfig'])

        checkText = self.safeParse(self.checkTxtXpath)
        last_md5 = toMd5(checkText)
        # Content unchanged since last crawl: emit nothing. (The original
        # ``yield []`` pushed an empty list into the pipeline, which is not a
        # valid item or Request.)
        if last_md5 == response.meta['spiderConfig'].get('last_md5', ''):
            return

        item = XmlFeedItem()
        item['title'] = [t.encode('utf-8') for t in self.safeParse(self.titleXpath)]

        imageAndDescriptionInfos = self.parseDescriptionAndImages()
        item['img_url'] = imageAndDescriptionInfos['img_url']
        item['description'] = imageAndDescriptionInfos['description']

        item['public_time'] = [p.encode('utf-8') for p in self.safeParse(self.pubDateXpath)]
        item['source_url'] = [g.encode('utf-8') for g in self.safeParse(self.guidXpath)]
        item['rule_id'] = self.rule_id
        yield item

        # Persist the new md5 to storage and fetch the next pending crawl config.
        spiderConfig = getCrawlNoRssRequest({'last_md5': last_md5, 'id': self.rule_id})
        if spiderConfig:
            # Guard: the old ``spiderConfig.get('start_urls', '')[0]`` raised
            # IndexError when the key was missing or empty (''[0]).
            start_urls = spiderConfig.get('start_urls') or []
            if start_urls:
                yield Request(start_urls[0],
                              headers={'Referer': 'http://www.google.com'},
                              meta={'spiderConfig': spiderConfig},
                              callback=self.parse_node,
                              dont_filter=True)
Ejemplo n.º 2
0
    def start_requests(self):
        """Build the spider's initial request from the next pending crawl config.

        :returns: a single-element list with the first configured start URL,
            or an empty list when no config (or no URL) is available.
        """
        spiderConfig = getCrawlNoRssRequest()
        if not spiderConfig:
            return []

        self.initConfig(spiderConfig)
        logging.info("*********meta******%s****************" % spiderConfig)

        # Guard: the old ``spiderConfig.get('start_urls', '')[0]`` raised
        # IndexError when 'start_urls' was missing or empty (''[0]).
        start_urls = spiderConfig.get('start_urls') or []
        if not start_urls:
            return []
        return [Request(start_urls[0], callback=self.parse, dont_filter=True)]
Ejemplo n.º 3
0
    def start_requests(self):
        """Build the spider's initial request from the next pending crawl config.

        :returns: a single-element list with the first configured start URL,
            or an empty list when no config (or no URL) is available.
        """
        spiderConfig = getCrawlNoRssRequest()
        if not spiderConfig:
            return []

        self.initConfig(spiderConfig)
        logging.info("*********meta******%s****************" % spiderConfig)

        # Guard: the old ``spiderConfig.get('start_urls', '')[0]`` raised
        # IndexError when 'start_urls' was missing or empty (''[0]).
        start_urls = spiderConfig.get('start_urls') or []
        if not start_urls:
            return []
        return [Request(start_urls[0], callback=self.parse, dont_filter=True)]
Ejemplo n.º 4
0
    def start_requests(self):
        """Build up to ``MAX_START_URLS_NUM`` initial requests from pending configs.

        Pulls one crawl config per iteration; stops early when the queue is
        exhausted. Each request carries its config in ``meta`` for
        :meth:`parse_node`.

        :returns: list of ``Request`` objects (possibly empty).
        """
        requests = []
        for _ in xrange(MAX_START_URLS_NUM):
            spiderConfig = getCrawlNoRssRequest()
            if not spiderConfig:
                break  # no more pending configs
            # Guard: the old ``spiderConfig.get('start_urls', '')[0]`` raised
            # IndexError (''[0]) and aborted the whole batch when one config
            # had no start URL; skip that config instead.
            start_urls = spiderConfig.get('start_urls') or []
            if not start_urls:
                continue
            requests.append(Request(start_urls[0],
                                    meta={'spiderConfig': spiderConfig},
                                    callback=self.parse_node,
                                    dont_filter=True))
        return requests