def parse_node(self, response):
    self.currentNode = response
    # logging.info("*********meta******%s****************" % response.meta['spiderConfig'])
    self.initConfig(response.meta['spiderConfig'])
    checkText = self.safeParse(self.checkTxtXpath)
    last_md5 = toMd5(checkText)
    # Only build an item when the check text has changed since the last
    # crawl; an unchanged digest means there is nothing new to yield.
    if last_md5 != response.meta['spiderConfig'].get('last_md5', ''):
        item = XmlFeedItem()
        item['title'] = [t.encode('utf-8') for t in self.safeParse(self.titleXpath)]
        imageAndDescriptionInfos = self.parseDescriptionAndImages()
        item['img_url'] = imageAndDescriptionInfos['img_url']
        item['description'] = imageAndDescriptionInfos['description']
        item['public_time'] = [p.encode('utf-8') for p in self.safeParse(self.pubDateXpath)]
        item['source_url'] = [g.encode('utf-8') for g in self.safeParse(self.guidXpath)]
        item['rule_id'] = self.rule_id
        yield item
    # Write the new md5 back to MySQL; if a refreshed config comes back,
    # re-queue the page so it keeps being polled for changes.
    spiderConfig = getCrawlNoRssRequest({'last_md5': last_md5, 'id': self.rule_id})
    if spiderConfig:
        yield Request(spiderConfig.get('start_urls', [''])[0],
                      headers={'Referer': 'http://www.google.com'},
                      meta={'spiderConfig': spiderConfig},
                      callback=self.parse_node,
                      dont_filter=True)
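toMd5 is not defined in this section; parse_node only assumes it reduces the extracted check text to a hex digest for change detection. A minimal sketch under that assumption (the list-joining behavior is a guess, since safeParse appears to return a list of strings):

import hashlib

def toMd5(texts):
    # safeParse call sites suggest a list of strings; join before hashing (assumption).
    if isinstance(texts, list):
        texts = u''.join(texts)
    if isinstance(texts, unicode):  # Python 2, matching the xrange/encode usage above
        texts = texts.encode('utf-8')
    return hashlib.md5(texts).hexdigest()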
def start_requests(self):
    # Single-rule variant: fetch one crawl config and issue its first URL.
    spiderConfig = getCrawlNoRssRequest()
    if not spiderConfig:
        return []
    self.initConfig(spiderConfig)
    logging.info("spiderConfig: %s" % spiderConfig)
    return [Request(spiderConfig.get('start_urls', [''])[0],
                    callback=self.parse,
                    dont_filter=True)]
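getCrawlNoRssRequest is only ever called here, with two shapes: no argument to fetch the next pending rule, or a dict to write last_md5 back and return the refreshed config. A hypothetical sketch of such a wrapper over a MySQL rule table; the connection details, table, and column names are all invented for illustration:

import MySQLdb

def getCrawlNoRssRequest(update=None):
    conn = MySQLdb.connect(host='localhost', user='crawler',
                           passwd='secret', db='spider')  # assumed credentials
    try:
        cursor = conn.cursor(MySQLdb.cursors.DictCursor)
        if update:
            cursor.execute("UPDATE crawl_rule SET last_md5 = %s WHERE id = %s",
                           (update['last_md5'], update['id']))
            conn.commit()
            cursor.execute("SELECT * FROM crawl_rule WHERE id = %s", (update['id'],))
        else:
            cursor.execute("SELECT * FROM crawl_rule WHERE pending = 1 LIMIT 1")
        row = cursor.fetchone()
        # The callers index start_urls as a list, so assume it is stored
        # comma-separated and split it here.
        if row and row.get('start_urls'):
            row['start_urls'] = row['start_urls'].split(',')
        return row
    finally:
        conn.close()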
def start_requests(self):
    # Batched variant: pull up to MAX_START_URLS_NUM pending crawl configs
    # and schedule one request per rule, passing each config along in meta
    # so parse_node can read it from response.meta.
    requestUrl = []
    for i in xrange(0, MAX_START_URLS_NUM):
        spiderConfig = getCrawlNoRssRequest()
        if not spiderConfig:
            break
        requestUrl.append(Request(spiderConfig.get('start_urls', [''])[0],
                                  meta={'spiderConfig': spiderConfig},
                                  callback=self.parse_node,
                                  dont_filter=True))
    return requestUrl
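parse_node, the callback registered above, leans on safeParse for every extracted field. Its implementation is not shown here; a plausible sketch, assuming it wraps the response's xpath() and degrades to an empty list rather than raising on a bad or missing expression:

import logging

def safeParse(self, xpath):
    # Return extracted strings for the XPath, or [] if the expression is
    # empty or fails, so item assembly never aborts mid-crawl.
    if not xpath:
        return []
    try:
        return self.currentNode.xpath(xpath).extract()
    except Exception:
        logging.exception("safeParse failed for xpath: %s" % xpath)
        return []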