def parse_page(self, response):
    """Extract a standard news item from a BuzzFeed article page."""
    sel = response.selector
    # Prune print-only / hidden markup before the loader sees the tree;
    # scrapy has no native loader/selector mutation methods for this.
    mutate_selector_del_xpath(
        sel,
        '//*[contains(@class, "print") or contains(@class, "hidden")]'
    )  # Physical print only

    loader = NewsLoader(selector=sel)

    # The canonical link carries no referer params on the end of the URL.
    loader.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # Standardised cross-provider fields. Override TakeFirst() fields by
    # making loader.add_* calls above this point; fill gaps with calls
    # below it.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_xpath(
        'bodytext',
        '//div[@data-print="body"]/*[not(contains(@class, "user-bio") or contains(@class, "_shares") or contains(@class, "inline-promo"))]//text()'
    )
    loader.add_xpath('bodytext',
                     '//div[contains(@class, "_item_text")]//text()')
    loader.add_xpath(
        'bodytext',
        '//article//*[contains(@class, "subbuzz-text") or '
        'contains(@class, "subbuzz__title")]//text()')

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.mirror.co.uk/news/uk-news/lesbian-couple-who-launched-crowdfunding-9902318
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector

    # Drop forms and "read more" asides before the loader sees the tree;
    # scrapy has no native loader/selector mutation methods for this.
    mutate_selector_del_xpath(sel, '//form')
    mutate_selector_del_xpath(
        sel, '//aside[contains(@class,"read-more-links")]')

    loader = NewsLoader(selector=sel)

    # Standardised cross-provider fields. Override TakeFirst() fields by
    # making loader.add_* calls above this point; fill gaps with calls
    # below it.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_css('bodytext', '.article-body ::text')  # Live

    return loader.load_item()
def parse_page(self, response):
    """Extract a news item, falling back on readability for the body."""
    sel = response.selector

    # Strip scripts and invisible elements before loading; scrapy has no
    # native loader/selector mutation methods for this.
    mutate_selector_del_xpath(sel, '//script')
    mutate_selector_del_xpath(sel, '//*[@style="display:none"]')

    loader = NewsLoader(selector=sel)
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_readability(response)

    return loader.load_item()
def parse_page(self, response):
    """@url https://www.thesun.co.uk/living/2937147/human-ken-doll-quentin-dehar-who-spent-92k-to-look-like-his-idol-has-dumped-his-surgery-obsessed-barbie-girlfriend-for-dying-her-hair-brown/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    s = response.selector
    # Remove any content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.

    # TODO: 'Keywords' and 'tags' for The Sun are different. Decide
    # which we want.

    # Lose "The Sun" link on the bottom of each page
    mutate_selector_del_xpath(
        s, '//div[contains(@class, "social--fb-page-button")]')
    # Lose the "related articles" carousel
    mutate_selector_del_xpath(s, '//div[contains(@class, "rail--trending")]')

    l = NewsLoader(selector=s)

    l.add_xpath('summary', 'meta[@name="description"]/@content')

    # Title-case the scraped byline names. The processor previously shadowed
    # the selector variable `s` inside its generator expression; use a
    # distinct name. TODO: This is kinda grot. Fine except for names like
    # "John da Silva".
    l.add_xpath(
        'bylines',
        '//span[contains(@class, "article__author-name")]//text()',
        lambda names: [name.title() for name in names])

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_scrapymeta(response)

    l.add_xpath(
        'bodytext',
        '//article//div[contains(@class, "article__content")]//text()')

    return l.load_item()
def parse_page(self, response):
    """Note: firstpubtime also fetched, but via RSS feed (which can't be
    contracted for)
    @url https://www.buzzfeed.com/maryanngeorgantopoulos/white-supremacists-are-spreading-their-message-on-college-ca
    @returns items 1
    @scrapes bodytext bylines fetchtime headline
    @scrapes section source summary url keywords language
    @noscrapes modtime
    """
    sel = response.selector

    # Prune print-only / hidden markup before the loader sees the tree;
    # scrapy has no native loader/selector mutation methods for this.
    mutate_selector_del_xpath(
        sel,
        '//*[contains(@class, "print") or contains(@class, "hidden")]'
    )  # Physical print only

    loader = NewsLoader(selector=sel)

    # The canonical link carries no referer params on the end of the URL.
    loader.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # Standardised cross-provider fields. Override TakeFirst() fields by
    # making loader.add_* calls above this point; fill gaps with calls
    # below it.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_xpath(
        'bodytext',
        '//div[@data-print="body"]/*[not(contains(@class, "user-bio") or contains(@class, "_shares") or contains(@class, "inline-promo"))]//text()'
    )
    loader.add_xpath('bodytext',
                     '//div[contains(@class, "_item_text")]//text()')
    loader.add_xpath(
        'bodytext',
        '//article//*[contains(@class, "subbuzz-text") or '
        'contains(@class, "subbuzz__title")]//text()')

    return loader.load_item()
def parse_page(self, response):
    """@url http://edition.cnn.com/2017/03/01/politics/joe-biden-hunter-beau/index.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector

    # Prune "read more" buttons, embedded media and carousels before the
    # loader sees the tree; scrapy has no native loader/selector mutation
    # methods for this.
    mutate_selector_del_xpath(
        sel, '//div[contains(@class, "read-more-button")]')
    mutate_selector_del_xpath(sel, '//div[contains(@class, "el__embedded")]')
    mutate_selector_del_xpath(sel, '//div[contains(@class, "owl-carousel")]')

    loader = NewsLoader(selector=sel)

    # Standardised cross-provider fields. Override TakeFirst() fields by
    # making loader.add_* calls above this point; fill gaps with calls
    # below it.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    # Headline fallbacks, tried in order.
    loader.add_xpath(
        'headline',
        '//article//meta[@itemprop="alternativeHeadline"]/@content')
    loader.add_xpath('headline', '//h1[contains(@class, "headline")]/text()')

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    s = response.selector

    # Remove some content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    mutate_selector_del_xpath(s, '//script')
    mutate_selector_del_xpath(s, '//*[@style="display:none"]')
    mutate_selector_del_xpath(
        s, '//div[contains(@class, "related-carousel")]')

    l = NewsLoader(selector=s)

    # Get alternative to RSS-source URL fluff
    l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # Extract strings, not Selector objects: the original passed a raw
    # SelectorList for 'headline', which would have stored Selector
    # instances in the item rather than text.
    l.add_value(
        'bylines',
        response.xpath(
            "//span[@data-component='Byline']"
            "//span[@data-component='Text']//a/text()").getall())
    l.add_value(
        'headline',
        response.xpath("//h1[@data-component='Headline']/text()").get())

    # Join the body text fragments in one pass instead of building the
    # string with += in a loop.
    l.add_value(
        'bodytext',
        ''.join(response.xpath('//div[@id="body"]//p//text()').getall()))

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_scrapymeta(response)
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_readability(response)

    # TODO: JS dross in body; might need a standard solution to keep this
    # out.
    # TODO: Related article dross in body. <div class=related-carousel>

    return l.load_item()
def parse_page(self, response):
    """@url http://www.usatoday.com/story/money/markets/2017/02/28/bonds-telling-less-bullish-tale-than-stocks/98503646/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector

    # Prune share tools, the print-URL box and asides before the loader
    # sees the tree; scrapy has no native loader/selector mutation
    # methods for this.
    mutate_selector_del_xpath(
        sel, '//*[contains(@class, "inline-share-tools")]')
    mutate_selector_del_xpath(
        sel, '//*[contains(@class, "article-print-url")]')
    mutate_selector_del_xpath(sel, '//aside')

    loader = NewsLoader(selector=sel)

    loader.add_xpath('bylines',
                     'head/meta[@name="cXenseParse:author"]/@content')

    # Section metadata comes out as e.g. "news,world": keep only the
    # first comma-separated token, title-cased ("News").
    loader.add_xpath(
        'section',
        'head/meta[@itemprop="articleSection"]/@content',
        Compose(
            TakeFirst(),
            lambda joined: joined.split(','),
            TakeFirst(),
            lambda section: section.title(),
        ))

    # Video pages
    loader.add_xpath('summary',
                     '//p[contains(@class, "vgm-video-description")]//text()')

    # USA Today provide timestamps to millisecond precision, in a format
    # which dateparser can't handle, so normalise them first.
    loader.add_xpath(
        'firstpubtime',
        '//*[@itemprop="datePublished" or @property="datePublished"]/@content',
        MapCompose(self.fix_usatoday_date))  # CreativeWork
    loader.add_xpath(
        'modtime',
        '//*[@itemprop="dateModified" or @property="dateModified"]/@content',
        MapCompose(self.fix_usatoday_date))  # CreativeWork

    # Standardised cross-provider fields. Override TakeFirst() fields by
    # making loader.add_* calls above this point; fill gaps with calls
    # below it.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    s = response.selector

    # Remove some content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    mutate_selector_del_xpath(s, '//script')
    mutate_selector_del_xpath(s, '//*[@style="display:none"]')
    mutate_selector_del_xpath(s,
                              '//div[contains(@class, "related-carousel")]')

    l = NewsLoader(selector=s)

    # Get alternative to RSS-source URL fluff
    l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    dross_re = (r' for (Dailymail.com|The Daily Mail|'
                'Daily Mail Australia|MailOnline)')

    # Strip publisher dross from bylines. BUG FIX: re.sub()'s fourth
    # positional argument is `count`, not `flags` — the original passed
    # re.IGNORECASE (== 2) as the replacement count, so the match was
    # case-SENSITIVE and capped at two substitutions. Pass flags by
    # keyword. The lambda parameter is also renamed so it no longer
    # shadows the selector variable `s`.
    l.add_xpath(
        'bylines',
        'head/meta[@property="article:author"]/@content',
        MapCompose(
            split_multiple_byline_string,
            lambda byline: re.sub(dross_re, '', byline,
                                  flags=re.IGNORECASE)))

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()

    # TODO: JS dross in body; might need a standard solution to keep this
    # out.
    # TODO: Related article dross in body. <div class=related-carousel>

    return l.load_item()