def parse_page(self, response):
    """@url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    s = response.selector
    # Remove some content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    mutate_selector_del_xpath(s, '//script')
    mutate_selector_del_xpath(s, '//*[@style="display:none"]')
    mutate_selector_del_xpath(
        s, '//div[contains(@class, "related-carousel")]')

    l = NewsLoader(selector=s)

    # Get alternative to RSS-source URL fluff
    l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # .getall() already yields a list of text strings; no need for
    # list(map(lambda x: x.get(), ...)).
    l.add_value(
        'bylines',
        response.xpath(
            "//span[@data-component='Byline']"
            "//span[@data-component='Text']//a/text()").getall())
    l.add_value('headline',
                response.xpath("//h1[@data-component='Headline']/text()"))

    # Join the paragraph fragments in a single pass; repeated string
    # concatenation in a loop is quadratic.
    l.add_value(
        'bodytext',
        ''.join(response.xpath('//div[@id="body"]//p//text()').getall()))

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_scrapymeta(response)
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_readability(response)

    # TODO: JS dross in body; might need a standard solution to keep this
    #       out.
    # TODO: Related article dross in body. <div class=related-carousel>
    return l.load_item()
def parse_page(self, response):
    """@url https://www.bild.de/politik/ausland/politik-ausland/wef-in-davos-die-top-gaeste-und-die-wichtigsten-themen-67441554.bild.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url language
    """
    sel = response.selector

    # Prune markup that would otherwise pollute the extracted text; there
    # are no native scrapy loader/selector methods for node deletion.
    unwanted_xpaths = (
        # noscript JS warnings (otherwise included as the bodytext for
        # video pages).
        '//noscript[contains(@class, "warning")]',
        # BildPLUS subscribe notice (note that BildPLUS articles are
        # paywalled; only the opening paragraphs will be fetched).
        '//strong[text()="Ihre neuesten Erkenntnisse lesen Sie mit BILDplus."]',
        # "Related topics" and "read also" boxes.
        '//aside[contains(@class, "related-topics")]',
        '//div[contains(@class, "tsr-info") and contains(text(), "Lesen Sie auch")]',
    )
    for xpath in unwanted_xpaths:
        mutate_selector_del(sel, 'xpath', xpath)

    l = NewsLoader(selector=sel)

    # Section comes from the breadcrumb trail.
    l.add_xpath('section', '//div[@id="breadcrumb"]//a[@rel="home"]//text()')

    # Standardised cross-provider extraction. TakeFirst() fields can be
    # overridden by add_* calls placed before these, or gaps supplemented
    # by calls placed after.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_scrapymeta(response)
    l.add_readability(response)

    # Readability is pretty good on Bild but sometimes misses the body;
    # these fallbacks are less clean because the "more reading" sections
    # etc. have not been removed.
    l.add_xpath('bodytext', '//div[contains(@class, "txt")]//text()')
    l.add_xpath('bodytext', '//div[contains(@class, "article-body")]//text()')

    return l.load_item()
def parse_page(self, response):
    """Generic page parse: strip scripts/hidden nodes, then rely solely on
    the standardised NewsLoader extraction helpers.
    """
    sel = response.selector

    # Delete nodes that would pollute the extracted text; scrapy has no
    # native selector method for this.
    for xpath in ('//script', '//*[@style="display:none"]'):
        mutate_selector_del_xpath(sel, xpath)

    loader = NewsLoader(selector=sel)
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_readability(response)

    return loader.load_item()
def parse_page(self, response):
    """@url https://www.theguardian.com/business/2017/feb/20/how-unilever-foiled-kraft-heinzs-115m-takeover-bid-warren-buffett
    @returns items 1
    @scrapes bodytext fetchtime firstpubtime headline bylines
    @scrapes section source summary url modtime keywords
    """
    s = response.selector
    # Remove any content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    # CSS is better for operating on classes than XPath, otherwise
    # either will do.
    # 1. Strip the submeta footer
    mutate_selector_del(s, 'xpath', '//div[contains(@class, "submeta")]')
    # 2. All the <aside> boxes
    mutate_selector_del(s, 'xpath', '//aside')

    l = NewsLoader(selector=s)

    l.add_value('source', 'The Guardian')

    # Bylines/headline come from the page's embedded JSON-LD when present.
    # Catch only the lookup failures a missing/odd structure can raise; a
    # bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    try:
        l.add_value(
            'bylines',
            [author['name']
             for author in response.meta['json-ld'][0][0]['author']])
        l.add_value('headline', response.meta['json-ld'][0][0]['headline'])
    except (KeyError, IndexError, TypeError):
        # Log via the spider logger rather than print() so it reaches the
        # crawl log.
        self.logger.debug("No bylines in json-ld")

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()

    # Some Guardian articles are missing their OpenGraph article section
    # tag. These data-link-name tags are often multiple.
    l.add_xpath('section',
                '//a[@data-link-name="article section"]/text()',
                TakeFirst(),
                lambda x: x.strip())

    # The body tagging varies depending on the type of article, so let's
    # try several.
    # TODO: There's still a bit of grot in this: <aside> tags, the social
    #       links under videos etc.
    # TODO: The <span class="drop-cap"> setup leaves a spurious line break
    #       after the first letter, which results in a space in the output
    #       text.
    l.add_xpath(
        'bodytext',
        '//article//div[contains(@class, "content__main-column")]'
        '/*[not(contains(@class, "meta"))]//text()'
    )  # Eyewitness, plus video?
    l.add_readability(response)

    return l.load_item()
def parse_page(self, response):
    """@url https://www.liverpoolecho.co.uk/news/liverpool-news/police-issue-warning-over-gift-19660932
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url language
    """
    sel = response.selector

    # Strip sidebar boxes before extraction; scrapy has no native
    # loader/selector method for deleting nodes from the tree.
    mutate_selector_del(sel, 'xpath', '//aside')

    l = NewsLoader(selector=sel)

    # Standardised cross-provider extraction. TakeFirst() fields can be
    # overridden by add_* calls placed before these, or gaps supplemented
    # by calls placed after.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_scrapymeta(response)
    l.add_readability(response)

    l.add_xpath('articleid', '//meta[@property="article:id"]/@content')

    # Scraping the comments is a little involved. It needs two things:
    #   1. the site_uuid, which our start_requests() bootstrapped at
    #      crawler startup; and
    #   2. the per-article content_container_uuid, which can only be
    #      resolved by a further request keyed on the container_id found
    #      in the page metadata.
    # So when a comments section exists we hand the half-finished loader
    # to a new Request; its callback fetches the comments, adds them in,
    # and yields the completed Item.

    # Guard clause: no comments widget (or no bootstrap data) means the
    # item is already complete.
    if not (l.get_xpath('//vf-conversations') and self.comments_bootstrap):
        logger.debug(f'No comments section: {response.url}')
        l.add_value('notes', 'No comments section')
        yield l.load_item()
        return

    site_uuid = self.comments_bootstrap['settings']['site_uuid']
    containerid = l.get_xpath('//meta[@name="vf:container_id"]/@content')[0]
    yield Request(
        f"https://livecomments.viafoura.co/v4/livecomments/{site_uuid}/contentcontainer/id?container_id={containerid}",
        method="GET",
        priority=5,
        callback=self.parse_comments_get_contentcontainer,
        errback=self.errback_comments,
        cb_kwargs={'l': l,
                   'site_uuid': site_uuid},
        meta={
            # We don't even want to fetch robots.txt here.
            'dont_obey_robotstxt': True
        })