Beispiel #1
0
    def parse_page3(self, response):
        """Build a ``NewsflowsItem`` from an article page response.

        The headline, body text and timestamp are selected with XPath and
        each joined into a single string. ``page1`` and ``page2`` are copied
        from ``response.meta``; ``page3`` is set to the local hostname.

        Returns the populated item.
        """

        def joined(expression):
            # Concatenate every string the XPath expression yields.
            return ''.join(response.xpath(expression).extract())

        record = NewsflowsItem()

        headline = joined(
            '//*[contains(concat( " ", @class, " " ), concat( " ", "article-headline", " " ))]/text()'
        )

        body = joined('//*[(@id = "article-text")]//p/text()')
        if not body:
            # No paragraph text under #article-text: fall back to the <pre>
            # nodes (extracted as whole elements, not just their text).
            body = joined('//pre')

        stamp = joined(
            '//*[contains(concat( " ", @class, " " ), concat( " ", "timestamp", " " ))]'
            '/text()')

        # Values handed over by the earlier request(s) via meta.
        record['page1'] = response.meta['page1']
        record['page2'] = response.meta['page2']
        record['page3'] = socket.gethostname()

        record['category'] = 'category'
        record['title'] = headline
        record['article'] = body
        record['pTimestamp'] = stamp

        record['scrape_time'] = datetime.datetime.now()
        record['spider'] = self.name

        return record
	def parse_node(self, response, node):
		"""Start a ``NewsflowsItem`` for one feed node and follow its link.

		The title, publication date and link are read from the node's
		``title``, ``pubDate`` and ``link`` children; a missing child raises
		``IndexError``. The partially filled item travels to
		``self.parse_link`` in the request meta under ``'newsitem'``.

		Yields a single ``Request`` for the node's link.
		"""
		entry = NewsflowsItem()
		entry['page1'] = response.url
		entry['page3'] = ''
		entry['category'] = 'category'

		raw_title = node.xpath('title/text()')[0].extract()
		entry['title'] = self.clean_string(raw_title)
		# The 'article' field is deliberately left unset in this method.
		entry['pTimestamp'] = node.xpath('pubDate/text()')[0].extract()
		entry['scrape_time'] = datetime.datetime.now()
		entry['spider'] = self.name

		link = node.xpath('link/text()').extract()[0].strip()
		yield Request(str(link), callback=self.parse_link, meta={'newsitem': entry})
	def parse_node(self, response, node):
		"""Create a ``NewsflowsItem`` from a node and request the linked page.

		Fills in the source URL, title, publication timestamp, scrape time
		and spider name, then yields one ``Request`` for the node's link with
		the item attached as ``meta['newsitem']`` and ``self.parse_link`` as
		the callback. Raises ``IndexError`` when ``title``, ``pubDate`` or
		``link`` has no text match.
		"""

		def first_text(path):
			# First string matched by the XPath relative to this node.
			return node.xpath(path).extract()[0]

		news = NewsflowsItem()
		news['page1'] = response.url
		# NOTE(review): 'page3' and 'category' receive constant values here;
		# they may be placeholders -- confirm with the item consumers.
		news['page3'] = ''
		news['category'] = 'category'
		news['title'] = self.clean_string(first_text('title/text()'))
		# The 'article' field is not populated by this method.
		news['pTimestamp'] = first_text('pubDate/text()')
		news['scrape_time'] = datetime.datetime.now()
		# NOTE(review): self.name is looked up through the instance, so it may
		# resolve to a class-level attribute -- flagged earlier as tricky.
		news['spider'] = self.name

		target = first_text('link/text()').strip()
		yield Request(str(target), callback=self.parse_link, meta={'newsitem': news})
Beispiel #4
0
    def parse_page3(self, response):
        """Build a ``NewsflowsItem`` from an article page response.

        A primary set of XPath expressions is tried for the title, article
        body and timestamp. If the primary title comes back empty, all three
        values are re-read with a fallback set of expressions. ``page1`` and
        ``page2`` are copied from ``response.meta``; ``page3`` is set to the
        local hostname.

        Returns the populated item.
        """

        def joined(expression):
            # Concatenate every string the XPath expression yields.
            return ''.join(response.xpath(expression).extract())

        # Each tuple is ordered (title, article, timestamp).
        primary_queries = (
            '//*[contains(concat( " ", @class, " " ), concat( " ", "pg-headline", " " ))]/text()',
            '//*[contains(concat( " ", @class, " " ), concat( " ", '
            '"zn-body__paragraph", " " ))]/text()',
            '//*[contains(concat( " ", @class, " " ), concat( " ", "update-time", '
            '" " ))]/text()',
        )
        # NOTE(review): the fallback article expression selects whole <p>
        # elements under #storytext (markup included) plus the text of every
        # <h2> in the document -- confirm that this mix is intended.
        fallback_queries = (
            '//*[contains(concat( " ", @class, " " ), concat( " ", "article-title", " " ))]/text()',
            '//*[(@id = "storytext")]//p | //h2/text()',
            '//*[contains(concat( " ", @class, " " ), '
            'concat( " ", "byline-timestamp", " " ))]//*'
            '[contains(concat( " ", @class, " " ), '
            'concat( " ", "cnnDateStamp", " " ))]/text()',
        )

        record = NewsflowsItem()

        headline, body, stamp = [joined(query) for query in primary_queries]
        if not headline:
            # Primary layout did not match: re-read everything with the
            # alternative layout's expressions.
            headline, body, stamp = [joined(query) for query in fallback_queries]

        # Values handed over by the earlier request(s) via meta.
        record['page1'] = response.meta['page1']
        record['page2'] = response.meta['page2']
        record['page3'] = socket.gethostname()

        record['category'] = 'category'
        record['title'] = headline
        record['article'] = body
        record['pTimestamp'] = stamp

        record['scrape_time'] = datetime.datetime.now()
        record['spider'] = self.name

        return record