def parse_page3(self, response):
    """Build a ``NewsflowsItem`` from an article page response.

    The headline, article body and publication timestamp are read from
    ``response`` with XPath queries; all matches of a query are concatenated
    into one string. When no paragraph text is found under the element with
    id ``article-text``, the extracted ``//pre`` nodes are used as the
    article instead.

    Args:
        response: Page response exposing ``xpath()`` and a ``meta`` mapping
            that holds the keys ``'page1'`` and ``'page2'``.

    Returns:
        The populated ``NewsflowsItem``.

    Raises:
        KeyError: If ``response.meta`` (assumed to be a dict) lacks
            ``'page1'`` or ``'page2'``.
    """

    def joined(query):
        # Concatenate every match of ``query`` into a single string.
        return ''.join(response.xpath(query).extract())

    headline = joined(
        '//*[contains(concat( " ", @class, " " ), '
        'concat( " ", "article-headline", " " ))]/text()'
    )

    body = joined('//*[(@id = "article-text")]//p/text()')
    if not body:
        # Fallback selects the <pre> nodes themselves, not their text nodes.
        body = joined('//pre')

    published = joined(
        '//*[contains(concat( " ", @class, " " ), '
        'concat( " ", "timestamp", " " ))]/text()'
    )

    item = NewsflowsItem()
    field_values = (
        ('page1', response.meta['page1']),
        ('page2', response.meta['page2']),
        # NOTE(review): 'page3' stores the local hostname, not a page
        # reference like 'page1'/'page2' -- confirm this is intended.
        ('page3', socket.gethostname()),
        ('category', 'category'),
        ('title', headline),
        ('article', body),
        ('pTimestamp', published),
        ('scrape_time', datetime.datetime.now()),
        ('spider', self.name),
    )
    for field, value in field_values:
        item[field] = value
    return item
def parse_node(self, response, node):
    """Start a ``NewsflowsItem`` for one feed node and follow its link.

    The item is filled with the feed URL, the cleaned title, the ``pubDate``
    text, the scrape time and the spider name. It is then handed to
    ``self.parse_link`` through the ``'newsitem'`` entry of the request's
    ``meta``. The ``'article'`` field is not set here.

    Args:
        response: Feed response; only its ``url`` is read.
        node: Selector for a single feed entry (presumably an RSS ``item``
            element -- verify against the spider configuration) with
            ``title``, ``pubDate`` and ``link`` children.

    Yields:
        A single ``Request`` for the node's link.

    Raises:
        IndexError: If the node has no ``title``, ``pubDate`` or ``link``
            text.
    """

    def first_text(query):
        # First match of ``query`` relative to the node, as a string.
        return node.xpath(query)[0].extract()

    item = NewsflowsItem()
    field_values = (
        ('page1', response.url),
        ('page3', ''),
        ('category', 'category'),
        ('title', self.clean_string(first_text('title/text()'))),
        ('pTimestamp', first_text('pubDate/text()')),
        ('scrape_time', datetime.datetime.now()),
        ('spider', self.name),
    )
    for field, value in field_values:
        item[field] = value

    link = node.xpath('link/text()').extract()[0].strip()
    yield Request(
        str(link),
        callback=self.parse_link,
        meta={'newsitem': item},
    )
def parse_node(self, response, node):
    """Start a ``NewsflowsItem`` for one feed node and follow its link.

    The item is filled with the feed URL, the cleaned title, the ``pubDate``
    text, the scrape time and the spider name. It is then handed to
    ``self.parse_link`` through the ``'newsitem'`` entry of the request's
    ``meta``. The ``'article'`` field is not set here.

    Args:
        response: Feed response; only its ``url`` is read.
        node: Selector for a single feed entry (presumably an RSS ``item``
            element -- verify against the spider configuration) with
            ``title``, ``pubDate`` and ``link`` children.

    Yields:
        A single ``Request`` for the node's link.

    Raises:
        IndexError: If the node has no ``title``, ``pubDate`` or ``link``
            text.
    """

    def first_text(query):
        # First match of ``query`` relative to the node, as a string.
        return node.xpath(query)[0].extract()

    item = NewsflowsItem()
    field_values = (
        ('page1', response.url),
        # NOTE(review): are 'page3' and 'category' just placeholders here?
        ('page3', ''),
        ('category', 'category'),
        ('title', self.clean_string(first_text('title/text()'))),
        ('pTimestamp', first_text('pubDate/text()')),
        ('scrape_time', datetime.datetime.now()),
        # NOTE(review): binding a class variable through an instance
        # (``self.name``) is tricky here.
        ('spider', self.name),
    )
    for field, value in field_values:
        item[field] = value

    link = node.xpath('link/text()').extract()[0].strip()
    yield Request(
        str(link),
        callback=self.parse_link,
        meta={'newsitem': item},
    )
def parse_page3(self, response):
    """Build a ``NewsflowsItem`` from an article page response.

    Two sets of XPath queries are tried in order. The first targets the
    ``pg-headline`` / ``zn-body__paragraph`` / ``update-time`` classes. If
    that yields an empty title, the title, article and timestamp are all
    re-read with the second set (``article-title``, ``storytext`` and
    ``byline-timestamp`` / ``cnnDateStamp``), replacing whatever the first
    set produced. All matches of a query are concatenated into one string.

    Args:
        response: Page response exposing ``xpath()`` and a ``meta`` mapping
            that holds the keys ``'page1'`` and ``'page2'``.

    Returns:
        The populated ``NewsflowsItem``.

    Raises:
        KeyError: If ``response.meta`` (assumed to be a dict) lacks
            ``'page1'`` or ``'page2'``.
    """

    def joined(query):
        # Concatenate every match of ``query`` into a single string.
        return ''.join(response.xpath(query).extract())

    # Each entry is (title query, article query, timestamp query).
    layouts = (
        (
            '//*[contains(concat( " ", @class, " " ), '
            'concat( " ", "pg-headline", " " ))]/text()',
            '//*[contains(concat( " ", @class, " " ), '
            'concat( " ", "zn-body__paragraph", " " ))]/text()',
            '//*[contains(concat( " ", @class, " " ), '
            'concat( " ", "update-time", " " ))]/text()',
        ),
        (
            '//*[contains(concat( " ", @class, " " ), '
            'concat( " ", "article-title", " " ))]/text()',
            # NOTE(review): the left side of this union selects <p> element
            # nodes (not their text) under "storytext", while the right side
            # selects text of every <h2> in the document -- confirm intended.
            '//*[(@id = "storytext")]//p | //h2/text()',
            '//*[contains(concat( " ", @class, " " ), '
            'concat( " ", "byline-timestamp", " " ))]'
            '//*[contains(concat( " ", @class, " " ), '
            'concat( " ", "cnnDateStamp", " " ))]/text()',
        ),
    )

    for title_query, article_query, stamp_query in layouts:
        headline = joined(title_query)
        body = joined(article_query)
        published = joined(stamp_query)
        if headline:
            # A non-empty title means this layout matched; stop here.
            break

    item = NewsflowsItem()
    field_values = (
        ('page1', response.meta['page1']),
        ('page2', response.meta['page2']),
        # NOTE(review): 'page3' stores the local hostname, not a page
        # reference like 'page1'/'page2' -- confirm this is intended.
        ('page3', socket.gethostname()),
        ('category', 'category'),
        ('title', headline),
        ('article', body),
        ('pTimestamp', published),
        ('scrape_time', datetime.datetime.now()),
        ('spider', self.name),
    )
    for field, value in field_values:
        item[field] = value
    return item