Ejemplo n.º 1
0
    def parse_page(self, response):
        """Build a news item from a single article page response.

        Unwanted subtrees are deleted from the response selector first, then
        a NewsLoader is populated from the standard sources and from the
        article-body CSS selector. The lines starting with an at-sign below
        are contract annotations and are kept verbatim.

        @url http://www.mirror.co.uk/news/uk-news/lesbian-couple-who-launched-crowdfunding-9902318
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        sel = response.selector
        # Scrapy loaders/selectors have no native way to drop nodes, so the
        # unwanted content is removed from the tree before the loader is
        # built on top of it.
        unwanted_xpaths = (
            '//form',
            '//aside[contains(@class,"read-more-links")]',
        )
        for unwanted in unwanted_xpaths:
            mutate_selector_del_xpath(sel, unwanted)

        loader = NewsLoader(selector=sel)

        # Data that should be standardised across providers. For TakeFirst()
        # fields, add_* calls placed before this group override it, while
        # calls placed after it only supplement gaps.
        loader.add_fromresponse(response)
        loader.add_htmlmeta()
        loader.add_schemaorg(response)
        loader.add_opengraph()
        loader.add_scrapymeta(response)

        loader.add_css('bodytext', '.article-body ::text')  # Live

        return loader.load_item()
Ejemplo n.º 2
0
    def parse_page(self, response):
        """Build a news item from a single article page response.

        Unwanted subtrees are deleted from the response selector first, then
        a NewsLoader is populated: provider-specific overrides, the standard
        sources, and finally fallback selectors for headline, section and
        body text. The lines starting with an at-sign below are contract
        annotations and are kept verbatim.

        @url https://www.nytimes.com/2017/02/28/science/california-aging-dams.html
        @returns items 1
        @scrapes bodytext bylines fetchtime firstpubtime modtime headline
        @scrapes keywords section source summary url
        """
        sel = response.selector
        # Scrapy loaders/selectors have no native way to drop nodes, so the
        # unwanted content is removed from the tree before the loader is
        # built on top of it. Each entry is (selector kind, expression).
        removals = (
            ('xpath', '//footer[contains(@class, "story-footer")]'),
            ('css', '.nocontent'),
            ('css', '.visually-hidden'),
            ('css', '.newsletter-signup'),
        )
        for kind, expression in removals:
            mutate_selector_del(sel, kind, expression)

        loader = NewsLoader(selector=sel)

        # Overrides: these must come before the standard group below.
        loader.add_value('source', 'New York Times')
        # The NYT response header leads to a non-canonical URL ending in
        # ?_r=0, so the canonical link from the page head is used instead.
        loader.add_xpath('url', 'head/link[@rel="canonical"]/@href')

        # Data that should be standardised across providers. For TakeFirst()
        # fields, add_* calls placed before this group override it, while
        # calls placed after it only supplement gaps.
        loader.add_fromresponse(response)
        loader.add_htmlmeta()
        loader.add_schemaorg(response)
        loader.add_opengraph()
        loader.add_scrapymeta(response)

        # Supplementary selectors, added after the standard group.
        loader.add_xpath(
            'headline', '//*[contains(@class, "Post__headline")]//text()')
        loader.add_xpath(
            'section', '//*[contains(@class, "Post__kicker")]//text()')

        story_or_post_body = (
            '//*[contains(@class, "story-body") or '
            'contains(@class, "Post__body")]//text()'
        )
        loader.add_xpath('bodytext', story_or_post_body)
        loader.add_xpath(
            'bodytext', '//div[contains(@class, "body--story")]//p//text()')
        loader.add_css('bodytext', '.interactive-graphic ::text')

        return loader.load_item()