Example #1
0
    def parse_page(self, response):
        """Parse an article page into a news item.

        Prunes print-only/hidden markup from the tree, fills a NewsLoader
        with the standard cross-provider extractors, then adds the site's
        body-text containers.
        """
        sel = response.selector
        # Scrapy has no native loader/selector method for deleting nodes,
        # so prune the tree with our helper first (physical print only).
        mutate_selector_del_xpath(
            sel, '//*[contains(@class, "print") or contains(@class, "hidden")]'
        )

        loader = NewsLoader(selector=sel)

        # Canonical link avoids referer params tacked onto the end of URLs.
        loader.add_xpath('url', 'head/link[@rel="canonical"]/@href')

        # Standardised cross-provider fields. add_* calls placed before this
        # point override TakeFirst() fields; calls after it fill in gaps.
        loader.add_fromresponse(response)
        loader.add_htmlmeta()
        loader.add_schemaorg(response)
        loader.add_opengraph()
        loader.add_scrapymeta(response)

        # Body text appears in several container variants; try each.
        loader.add_xpath(
            'bodytext',
            '//div[@data-print="body"]/*[not(contains(@class, "user-bio") or contains(@class, "_shares") or contains(@class, "inline-promo"))]//text()'
        )
        loader.add_xpath('bodytext',
                         '//div[contains(@class, "_item_text")]//text()')
        loader.add_xpath(
            'bodytext', '//article//*[contains(@class, "subbuzz-text") or '
            'contains(@class, "subbuzz__title")]//text()')

        return loader.load_item()
Example #2
0
    def parse_page(self, response):
        """@url http://www.mirror.co.uk/news/uk-news/lesbian-couple-who-launched-crowdfunding-9902318
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        sel = response.selector
        # Scrapy has no native loader/selector method for deleting nodes,
        # so prune unwanted content with our tree-mutating helper first.
        mutate_selector_del_xpath(sel, '//form')
        # Drop "read more" sidebars so they don't pollute the body text.
        mutate_selector_del_xpath(
            sel, '//aside[contains(@class,"read-more-links")]')

        loader = NewsLoader(selector=sel)

        # Standardised cross-provider fields. add_* calls placed before this
        # point override TakeFirst() fields; calls after it fill in gaps.
        loader.add_fromresponse(response)
        loader.add_htmlmeta()
        loader.add_schemaorg(response)
        loader.add_opengraph()
        loader.add_scrapymeta(response)

        # Live
        loader.add_css('bodytext', '.article-body ::text')

        return loader.load_item()
 def parse_page(self, response):
     """Parse an article page into a news item, using readability for body text."""
     sel = response.selector
     # Strip scripts and hidden elements before extraction; scrapy has no
     # native loader/selector method for removing nodes.
     mutate_selector_del_xpath(sel, '//script')
     mutate_selector_del_xpath(sel, '//*[@style="display:none"]')
     loader = NewsLoader(selector=sel)
     loader.add_fromresponse(response)
     loader.add_htmlmeta()
     loader.add_schemaorg(response)
     loader.add_opengraph()
     loader.add_readability(response)
     return loader.load_item()
Example #4
0
    def parse_page(self, response):
        """Parse a Sun article page into a news item.

        @url https://www.thesun.co.uk/living/2937147/human-ken-doll-quentin-dehar-who-spent-92k-to-look-like-his-idol-has-dumped-his-surgery-obsessed-barbie-girlfriend-for-dying-her-hair-brown/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """

        s = response.selector
        # Remove any content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.

        # TODO: 'Keywords' and 'tags' for The Sun are different. Decide
        #       which we want.

        # Lose "The Sun" link on the bottom of each page
        mutate_selector_del_xpath(
            s, '//div[contains(@class, "social--fb-page-button")]')
        # Lose the "related articles" carousel
        mutate_selector_del_xpath(s,
                                  '//div[contains(@class, "rail--trending")]')

        l = NewsLoader(selector=s)

        # NOTE(review): relative path — other spiders use 'head/meta[...]'
        # here; confirm this matches as intended.
        l.add_xpath('summary', 'meta[@name="description"]/@content')

        # TODO: This is kinda grot. Fine except for names like "John da Silva".
        # BUGFIX: the processor's parameter was named 's', shadowing the
        # selector 's' bound above; renamed for clarity (same behaviour).
        l.add_xpath(
            'bylines',
            '//span[contains(@class, "article__author-name")]//text()',
            lambda names: (name.title() for name in names))

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_scrapymeta(response)

        l.add_xpath(
            'bodytext',
            '//article//div[contains(@class, "article__content")]//text()')

        return l.load_item()
Example #5
0
    def parse_page(self, response):
        """Note: firstpubtime also fetched, but via RSS feed (which can't be
                 contracted for)

        @url https://www.buzzfeed.com/maryanngeorgantopoulos/white-supremacists-are-spreading-their-message-on-college-ca
        @returns items 1
        @scrapes bodytext bylines fetchtime headline
        @scrapes section source summary url keywords language
        @noscrapes modtime
        """

        sel = response.selector
        # No native scrapy loader/selector API exists for node removal, so
        # prune the tree with our helper first. Physical print only.
        mutate_selector_del_xpath(
            sel, '//*[contains(@class, "print") or contains(@class, "hidden")]'
        )

        loader = NewsLoader(selector=sel)

        # The canonical link is free of referer params on the URL tail.
        loader.add_xpath('url', 'head/link[@rel="canonical"]/@href')

        # Standardised cross-provider fields. add_* calls placed before this
        # point override TakeFirst() fields; calls after it fill in gaps.
        loader.add_fromresponse(response)
        loader.add_htmlmeta()
        loader.add_schemaorg(response)
        loader.add_opengraph()
        loader.add_scrapymeta(response)

        # BuzzFeed body text comes in several container variants; try each.
        loader.add_xpath(
            'bodytext',
            '//div[@data-print="body"]/*[not(contains(@class, "user-bio") or contains(@class, "_shares") or contains(@class, "inline-promo"))]//text()'
        )
        loader.add_xpath('bodytext',
                         '//div[contains(@class, "_item_text")]//text()')
        loader.add_xpath(
            'bodytext', '//article//*[contains(@class, "subbuzz-text") or '
            'contains(@class, "subbuzz__title")]//text()')

        return loader.load_item()
Example #6
0
    def parse_page(self, response):
        """@url http://edition.cnn.com/2017/03/01/politics/joe-biden-hunter-beau/index.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        sel = response.selector
        # Prune "read more" buttons, embedded elements and carousels before
        # loading; scrapy offers no native loader/selector node removal.
        mutate_selector_del_xpath(
            sel, '//div[contains(@class, "read-more-button")]')
        mutate_selector_del_xpath(sel,
                                  '//div[contains(@class, "el__embedded")]')
        mutate_selector_del_xpath(sel,
                                  '//div[contains(@class, "owl-carousel")]')

        loader = NewsLoader(selector=sel)

        # Standardised cross-provider fields. add_* calls placed before this
        # point override TakeFirst() fields; calls after it fill in gaps.
        loader.add_fromresponse(response)
        loader.add_htmlmeta()
        loader.add_schemaorg(response)
        loader.add_opengraph()
        loader.add_scrapymeta(response)

        # Two headline sources: the alternativeHeadline meta tag and the
        # visible h1 element.
        loader.add_xpath(
            'headline',
            '//article//meta[@itemprop="alternativeHeadline"]/@content')
        loader.add_xpath('headline',
                         '//h1[contains(@class, "headline")]/text()')

        return loader.load_item()
Example #7
0
    def parse_page(self, response):
        """Parse a Daily Mail article page into a news item.

        @url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """

        s = response.selector
        # Remove some content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(s, '//script')
        mutate_selector_del_xpath(s, '//*[@style="display:none"]')
        mutate_selector_del_xpath(
            s, '//div[contains(@class, "related-carousel")]')

        l = NewsLoader(selector=s)

        # Get alternative to RSS-source URL fluff
        l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

        # getall() extracts the byline strings directly — clearer than the
        # previous list(map(lambda x: x.get(), ...)) over the SelectorList.
        l.add_value(
            'bylines',
            response.xpath(
                "//span[@data-component='Byline']"
                "//span[@data-component='Text']//a/text()").getall())
        # BUGFIX: pass extracted strings to the loader, not a raw
        # SelectorList.
        l.add_value(
            'headline',
            response.xpath("//h1[@data-component='Headline']/text()").getall())
        # Join all body-paragraph text fragments in one pass instead of
        # building the string with repeated '+=' concatenation.
        l.add_value(
            'bodytext',
            ''.join(response.xpath('//div[@id="body"]//p//text()').getall()))

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_scrapymeta(response)
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()
        l.add_readability(response)

        # TODO: JS dross in body; might need a standard solution to keep this
        #       out.
        # TODO: Related article dross in body. <div class=related-carousel>

        return l.load_item()
Example #8
0
    def parse_page(self, response):
        """@url http://www.usatoday.com/story/money/markets/2017/02/28/bonds-telling-less-bullish-tale-than-stocks/98503646/
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        sel = response.selector
        # Scrapy has no native loader/selector node removal, so prune share
        # tools, print-URL boxes and asides with our helper up front.
        mutate_selector_del_xpath(
            sel, '//*[contains(@class, "inline-share-tools")]')
        mutate_selector_del_xpath(
            sel, '//*[contains(@class, "article-print-url")]')
        mutate_selector_del_xpath(sel, '//aside')

        loader = NewsLoader(selector=sel)

        loader.add_xpath('bylines',
                         'head/meta[@name="cXenseParse:author"]/@content')
        # Section metadata comes out as e.g. "news,world"; keep only the
        # first entry, title-cased ("News").
        loader.add_xpath(
            'section', 'head/meta[@itemprop="articleSection"]/@content',
            Compose(
                TakeFirst(),
                lambda sec: sec.split(','),
                TakeFirst(),
                lambda sec: sec.title(),
            ))

        # Video pages
        loader.add_xpath(
            'summary',
            '//p[contains(@class, "vgm-video-description")]//text()')

        # USA Today provide timestamps to millisecond precision, in a format
        # which dateparser can't handle, so run them through the spider's
        # fixup first.  (CreativeWork)
        loader.add_xpath(
            'firstpubtime',
            '//*[@itemprop="datePublished" or @property="datePublished"]/@content',
            MapCompose(self.fix_usatoday_date))
        loader.add_xpath(
            'modtime',
            '//*[@itemprop="dateModified" or @property="dateModified"]/@content',
            MapCompose(self.fix_usatoday_date))

        # Standardised cross-provider fields. add_* calls placed before this
        # point override TakeFirst() fields; calls after it fill in gaps.
        loader.add_fromresponse(response)
        loader.add_htmlmeta()
        loader.add_schemaorg(response)
        loader.add_opengraph()
        loader.add_scrapymeta(response)

        return loader.load_item()
    def parse_page(self, response):
        """Parse a Daily Mail article page into a news item.

        @url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """

        s = response.selector
        # Remove some content from the tree before passing it to the loader.
        # There aren't native scrapy loader/selector methods for this.
        mutate_selector_del_xpath(s, '//script')
        mutate_selector_del_xpath(s, '//*[@style="display:none"]')
        mutate_selector_del_xpath(
            s, '//div[contains(@class, "related-carousel")]')

        l = NewsLoader(selector=s)

        # Get alternative to RSS-source URL fluff
        l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

        # Publisher-name fluff appended to byline strings.
        drosss = (r' for (Dailymail.com|The Daily Mail|'
                    'Daily Mail Australia|MailOnline)')
        # Sort out bylines with less fluff.
        # BUGFIX: re.sub's fourth positional argument is `count`, not
        # `flags` — passing re.IGNORECASE (== 2) positionally made the match
        # case-sensitive and capped substitutions at 2. Pass it via the
        # `flags` keyword. The lambda parameter is also renamed so it no
        # longer shadows the selector 's'.
        l.add_xpath(
            'bylines',
            'head/meta[@property="article:author"]/@content',
            MapCompose(
                split_multiple_byline_string,
                lambda byline: re.sub(drosss, '', byline,
                                      flags=re.IGNORECASE)))

        # Add a number of items of data that should be standardised across
        # providers. Can override these (for TakeFirst() fields) by making
        # l.add_* calls above this line, or supplement gaps by making them
        # below.
        l.add_fromresponse(response)
        l.add_htmlmeta()
        l.add_schemaorg(response)
        l.add_opengraph()

        # TODO: JS dross in body; might need a standard solution to keep this
        #       out.
        # TODO: Related article dross in body. <div class=related-carousel>

        return l.load_item()