def parse_page(self, response):
        """Build an ``OpenpoliticsItem`` from an article page.

        The title comes from the ``og:title`` meta tag, the text from the
        ``<p>`` descendants of ``div.article__body`` and the timestamp from
        the ``datePublished`` meta tag.

        Returns:
            The populated item, or ``None`` when the page has no body text
            or when its timestamp does not contain ``'2016'``.
        """
        hxs = HtmlXPathSelector(response)
        title = hxs.select(
            '//meta[@property="og:title"]/@content').extract_first()
        body = [
            s.strip() for s in hxs.select(
                '//div[@class="article__body"]//p//text()').extract()
        ]
        time = hxs.select(
            '//meta[@itemprop="datePublished"]/@content').extract_first()

        if not body:
            return None

        # extract_first() yields None when the meta tag is missing, so the
        # year filter may only run when a timestamp was actually found.
        # Previously a missing tag raised AttributeError here, which also
        # made the fallback branch below unreachable.
        if time and '2016' not in time:
            return None

        item = OpenpoliticsItem()
        item['title'] = title
        item['text'] = body
        item['url'] = response.url
        if time:
            item['date'] = dateutil.parser.parse(time)
        else:
            # No machine-readable date: keep the raw strings from the
            # "akt_bar" element instead (a list, not a parsed datetime).
            item['time'] = hxs.select(
                '//div[@class="akt_bar"]/span/text()').extract()
        # Numeric tag hard-coded for this parser; its meaning is not
        # visible from this method.
        item['i'] = 10

        return item
    def parse_page(self, response):
        """Build an ``OpenpoliticsItem`` from an article page.

        Title and text are read from the ``itemprop`` headline / articleBody
        elements (ignoring script, style and noscript content); the
        timestamp comes from the ``article:published_time`` meta tag.

        Returns:
            The populated item, or ``None`` unless the page has body text
            and a timestamp containing ``'2016'``.
        """
        selector = HtmlXPathSelector(response)

        headline = selector.select(
            '//h1[@itemprop="headline name"]//text()[not(ancestor::script|ancestor::style|ancestor::noscript)]'
        ).extract_first()
        headline = headline.strip() if headline else headline

        text_nodes = selector.select(
            '//span[@itemprop="articleBody"]//text()[not(ancestor::script|ancestor::style|ancestor::noscript|ancestor::h1)]'
        ).extract()
        paragraphs = [node.strip() for node in text_nodes]

        published = selector.select(
            '//meta[@property="article:published_time"]/@content'
        ).extract_first()

        # Both the article text and a timestamp are required.
        if not (paragraphs and published):
            return None
        # Only articles whose timestamp mentions 2016 are kept.
        if '2016' not in published:
            return None

        item = OpenpoliticsItem()
        item['title'] = headline
        item['text'] = paragraphs
        item['url'] = response.url
        item['date'] = dateutil.parser.parse(published)
        item['i'] = 1
        return item
    def parse_page(self, response):
        """Build an ``OpenpoliticsItem`` from an article page.

        The title comes from the ``og:title`` meta tag and the text from the
        ``<p>`` descendants of ``article#item-article`` (ignoring script,
        style and noscript content). The timestamp is read from the
        article header's ``<time datetime>`` and, failing that, from the
        page header's.

        Returns:
            The populated item, or ``None`` when the page has no body text
            or when its timestamp does not contain ``'2016'`` (in which
            case the rejected timestamp is printed).
        """
        hxs = HtmlXPathSelector(response)
        title = hxs.select(
            '//meta[@property="og:title"]/@content').extract_first()
        body = [
            s.strip() for s in hxs.select(
                '//article[@id="item-article"]//p//text()[not('
                'ancestor::script|ancestor::style|ancestor::noscript)]').
            extract()
        ]
        time = hxs.select('//article[@id="item-article"]/header/time/@datetime'
                          ).extract_first()
        if not time:
            time = hxs.select(
                '//header[@id="page-header"]/time/@datetime').extract_first()

        if not body:
            return None

        # Both lookups can come back empty (None); only filter on the year
        # when a timestamp exists. Previously this raised AttributeError
        # and the "if time:" guard further down could never be False.
        if time and '2016' not in time:
            # Parenthesised form prints the same text under Python 2 and 3.
            print(time)
            return None

        item = OpenpoliticsItem()
        item['title'] = title
        item['text'] = body
        item['url'] = response.url
        if time:
            item['date'] = dateutil.parser.parse(time)
        # Numeric tag hard-coded for this parser; its meaning is not
        # visible from this method.
        item['i'] = 7

        return item
Exemple #4
0
    def parse_page(self, response):
        """Build an ``OpenpoliticsItem`` from an article page.

        The text is taken from the ``<p>`` descendants of
        ``section.art_content``; when that yields nothing, every text node
        under ``div#artykul`` is used instead. The raw ``datetime``
        attribute is stored unparsed under ``'time'``.

        Returns:
            The populated item, or ``None`` when no body text was found.
        """
        selector = HtmlXPathSelector(response)

        def stripped_texts(xpath):
            # Extract every match for *xpath* and trim surrounding whitespace.
            return [text.strip() for text in selector.select(xpath).extract()]

        headline = selector.select(
            '//h1[@class="art-title"]/text()').extract_first()

        # The second selector is only evaluated if the first finds nothing.
        paragraphs = (
            stripped_texts('//section[@class="art_content"]//p//text()')
            or stripped_texts('//div[@id="artykul"]//text()')
        )

        published = selector.select(
            '//time[@class="art-datetime"]/@datetime').extract_first()

        if not paragraphs:
            return None

        item = OpenpoliticsItem()
        item['title'] = headline
        item['text'] = paragraphs
        item['url'] = response.url
        # Stored as found on the page (may be None); no date parsing here.
        item['time'] = published
        item['i'] = 9
        return item
Exemple #5
0
 def parse_page(self, response):
     """Build an ``OpenpoliticsItem`` with the page's title, body and URL.

     The item is always returned, even when the selectors match nothing;
     the body text nodes are stored as extracted (not stripped).
     """
     selector = HtmlXPathSelector(response)
     heading = selector.select(
         '//span[@class="article-heading__title"]/text()'
     ).extract_first()
     text_nodes = selector.select(
         '//section[@class="article-page"]/p//text()'
     ).extract()

     item = OpenpoliticsItem()
     item['title'] = heading
     item['body'] = text_nodes
     item['url'] = response.url
     return item
    def parse_page(self, response):
        """Build an ``OpenpoliticsItem`` from an article page.

        The title comes from the ``og:title`` meta tag, the text from the
        ``<p>`` descendants of ``div.text`` and the timestamp from the
        ``article:published_time`` meta tag.

        Returns:
            The populated item, or ``None`` when the page has no body text
            or when its timestamp does not contain ``'2016'`` (in which
            case the rejected timestamp is printed).
        """
        hxs = HtmlXPathSelector(response)
        title = hxs.select('//meta[@property="og:title"]/@content').extract_first()
        body = [s.strip() for s in hxs.select('//div[@class="text"]//p//text()').extract()]
        time = hxs.select('//meta[@property="article:published_time"]/@content').extract_first()

        if not body:
            return None

        # extract_first() yields None when the meta tag is missing; only
        # filter on the year when a timestamp exists. Previously this raised
        # AttributeError and the "if time:" guard below was dead code.
        if time and '2016' not in time:
            # Parenthesised form prints the same text under Python 2 and 3.
            print(time)
            return None

        item = OpenpoliticsItem()
        item['title'] = title
        item['text'] = body
        item['url'] = response.url
        if time:
            item['date'] = dateutil.parser.parse(time)
        # Numeric tag hard-coded for this parser; its meaning is not
        # visible from this method.
        item['i'] = 2

        return item
    def parse_page(self, response):
        """Build an ``OpenpoliticsItem`` from an article page.

        The title comes from the ``og:title`` meta tag, the text from the
        ``<div>`` descendants of ``div#intext_content_tag`` and the
        timestamp from the first text node of ``div.article_info``. The
        stripped timestamp is stored unparsed under ``'time'``.

        Returns:
            The populated item, or ``None`` unless the page has body text
            and a non-empty timestamp containing ``'2016'``.
        """
        selector = HtmlXPathSelector(response)
        headline = selector.select(
            '//meta[@property="og:title"]/@content').extract_first()
        paragraphs = [
            fragment.strip()
            for fragment in selector.select(
                '//div[@id="intext_content_tag"]//div//text()').extract()
        ]
        raw_time = selector.select(
            '//div[@class="article_info"]/text()').extract_first()
        published = raw_time.strip() if raw_time else raw_time

        # A whitespace-only timestamp strips to '' and is treated as missing.
        if not paragraphs or not published:
            return None
        if '2016' not in published:
            return None

        item = OpenpoliticsItem()
        item['title'] = headline
        item['text'] = paragraphs
        item['url'] = response.url
        item['time'] = published
        item['i'] = 8
        return item
Exemple #8
0
    def parse_page(self, response):
        """Build an ``OpenpoliticsItem`` from an article page.

        The title comes from the ``og:title`` meta tag, the text from the
        ``<p>`` descendants of ``div.FAZArtikelText`` and the timestamp
        from the ``content`` attribute of ``span.Datum``. The timestamp is
        stored unparsed under ``'time'``.

        Returns:
            The populated item, or ``None`` unless the page has body text
            and a timestamp containing ``'2016'``.
        """
        selector = HtmlXPathSelector(response)
        headline = selector.select(
            '//meta[@property="og:title"]/@content').extract_first()
        text_nodes = selector.select(
            '//div[@class="FAZArtikelText"]//p//text()').extract()
        paragraphs = [node.strip() for node in text_nodes]
        published = selector.select(
            '//span[@class="Datum"]/@content').extract_first()

        # Require both text and a timestamp mentioning 2016.
        if not (paragraphs and published):
            return None
        if '2016' not in published:
            return None

        item = OpenpoliticsItem()
        item['title'] = headline
        item['text'] = paragraphs
        item['url'] = response.url
        item['time'] = published
        item['i'] = 5
        return item
    def parse_page(self, response):
        """Build an ``OpenpoliticsItem`` from an article page.

        The title comes from the ``og:title`` meta tag, the text from the
        ``<p>`` descendants of ``section.body`` and the timestamp from the
        ``datetime`` attribute of ``time.timeformat``.

        Returns:
            The populated item, or ``None`` unless the page has body text
            and a timestamp containing ``'2016'``.
        """
        selector = HtmlXPathSelector(response)
        headline = selector.select(
            '//meta[@property="og:title"]/@content').extract_first()
        text_nodes = selector.select(
            '//section[@class="body"]//p//text()').extract()
        paragraphs = [node.strip() for node in text_nodes]
        published = selector.select(
            '//time[@class="timeformat"]/@datetime').extract_first()

        # Require both text and a timestamp mentioning 2016.
        if not (paragraphs and published):
            return None
        if '2016' not in published:
            return None

        item = OpenpoliticsItem()
        item['title'] = headline
        item['text'] = paragraphs
        item['url'] = response.url
        item['date'] = dateutil.parser.parse(published)
        item['i'] = 4
        return item
Exemple #10
0
    def parse_page(self, response):
        """Build an ``OpenpoliticsItem`` from an article page.

        The title is the first text node under ``span.headline`` (stripped),
        the text comes from ``<p>`` children of ``div.txt`` and the
        timestamp from the ``<time datetime>`` inside ``div.authors``.

        Returns:
            The populated item, or ``None`` unless the page has body text
            and a timestamp containing ``'2016'``.
        """
        selector = HtmlXPathSelector(response)

        headline = selector.select(
            '//span[@class="headline"]//text()').extract_first()
        headline = headline.strip() if headline else headline

        text_nodes = selector.select('//div[@class="txt"]/p//text()').extract()
        paragraphs = [node.strip() for node in text_nodes]

        published = selector.select(
            '//div[@class="authors"]//time/@datetime').extract_first()

        # Require both text and a timestamp mentioning 2016.
        if not (paragraphs and published):
            return None
        if '2016' not in published:
            return None

        item = OpenpoliticsItem()
        item['title'] = headline
        item['text'] = paragraphs
        item['url'] = response.url
        item['date'] = dateutil.parser.parse(published)
        item['i'] = 0
        return item