def parse_page(self, response):
    """Build a news item from a Mirror article response.

    Unwanted subtrees are pruned from the response selector, the result is
    handed to a ``NewsLoader``, and the loaded item is returned. The lines
    below that begin with an at-sign are Scrapy contracts; keep them intact.

    @url http://www.mirror.co.uk/news/uk-news/lesbian-couple-who-launched-crowdfunding-9902318
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    selector = response.selector

    # Scrapy's loader/selector API offers no native way to drop nodes, so
    # the tree is mutated in place before the loader ever sees it.
    unwanted_xpaths = (
        '//form',
        '//aside[contains(@class,"read-more-links")]',
    )
    for xpath in unwanted_xpaths:
        mutate_selector_del_xpath(selector, xpath)

    loader = NewsLoader(selector=selector)

    # Data that should be standardised across providers. For TakeFirst()
    # fields, an add_* call placed before this group overrides it, while
    # one placed after it only fills in gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    # Provider-specific supplement. The original carried the note "Live"
    # here (presumably the live-article layout; confirm).
    loader.add_css('bodytext', '.article-body ::text')

    return loader.load_item()
def parse_page(self, response):
    """Build a news item from a New York Times article response.

    Unwanted subtrees are pruned from the response selector, the result is
    handed to a ``NewsLoader``, and the loaded item is returned. The lines
    below that begin with an at-sign are Scrapy contracts; keep them intact.

    @url https://www.nytimes.com/2017/02/28/science/california-aging-dams.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    selector = response.selector

    # Scrapy's loader/selector API offers no native way to drop nodes, so
    # the tree is mutated in place before the loader ever sees it.
    unwanted_nodes = (
        ('xpath', '//footer[contains(@class, "story-footer")]'),
        ('css', '.nocontent'),
        ('css', '.visually-hidden'),
        ('css', '.newsletter-signup'),
    )
    for query_type, query in unwanted_nodes:
        mutate_selector_del(selector, query_type, query)

    loader = NewsLoader(selector=selector)

    # These come before the standard block so they take precedence for
    # TakeFirst() fields.
    loader.add_value('source', 'New York Times')
    # The NYT response header leads to a non-canonical URL ending in
    # ?_r=0, so the canonical link from the page head is added first.
    loader.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # Data that should be standardised across providers. For TakeFirst()
    # fields, an add_* call placed before this group overrides it, while
    # one placed after it only fills in gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    # Provider-specific supplements, applied in this exact order.
    supplementary_xpaths = (
        ('headline', '//*[contains(@class, "Post__headline")]//text()'),
        ('section', '//*[contains(@class, "Post__kicker")]//text()'),
        ('bodytext',
         '//*[contains(@class, "story-body") '
         'or contains(@class, "Post__body")]//text()'),
        ('bodytext', '//div[contains(@class, "body--story")]//p//text()'),
    )
    for field_name, xpath in supplementary_xpaths:
        loader.add_xpath(field_name, xpath)
    loader.add_css('bodytext', '.interactive-graphic ::text')

    return loader.load_item()