def parse_page(self, response):
    """@url http://edition.cnn.com/2017/03/01/politics/joe-biden-hunter-beau/index.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector

    # Prune "read more" buttons, embedded widgets and carousels before
    # loading; scrapy has no native delete, so mutate the tree in place.
    mutate_selector_del_xpath(
        sel, '//div[contains(@class, "read-more-button")]')
    mutate_selector_del_xpath(sel, '//div[contains(@class, "el__embedded")]')
    mutate_selector_del_xpath(sel, '//div[contains(@class, "owl-carousel")]')

    loader = NewsLoader(selector=sel)

    # Standardised cross-provider extraction. TakeFirst() fields keep the
    # first value added, so provider-specific overrides go before these;
    # gap-fillers go after.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    # Headline fallbacks for pages whose metadata is incomplete.
    loader.add_xpath(
        'headline',
        '//article//meta[@itemprop="alternativeHeadline"]/@content')
    loader.add_xpath('headline', '//h1[contains(@class, "headline")]/text()')

    return loader.load_item()
def parse_page(self, response):
    """@url http://metro.co.uk/2017/02/22/telescope-spots-our-best-bet-for-finding-aliens-a-nearby-star-with-seven-earth-sized-planets-6464648/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector
    loader = NewsLoader(selector=sel)

    # The schema.org articleBody is full of headline/byline/fluff, so
    # take the body straight from the article container instead.
    loader.add_xpath('bodytext',
                     '//div[contains(@class, "article-body")]//text()')

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    # Bylines with the trailing "For Metro.co.uk" fluff stripped.
    loader.add_xpath(
        'bylines',
        '//span[contains(@class, "byline")]//a[@rel="author"]//text()',
        MapCompose(lambda name: re.sub(
            r' For Metro\.co\.uk', r'', name, flags=re.IGNORECASE)))

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.prnewswire.com/news-releases/xti-aircraft-company-and-bye-aerospace-form-alliance-on-hybridelectric-vertical-takeoff-airplane-300418161.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime headline
    @scrapes keywords source summary url
    @noscrapes modtime section
    """
    sel = response.selector
    loader = NewsLoader(selector=sel)

    loader.add_value('source', 'PR Newswire')

    # These <meta> tags are not being picked up by the head/meta parsing
    # for some reason, so fetch them explicitly.
    loader.add_xpath('summary', '//meta[@name="description"]/@content')
    loader.add_xpath('bylines', '//meta[@name="author"]/@content')

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_xpath('firstpubtime', '//meta[@name="date"]/@content')

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.cbsnews.com/news/iraqi-boy-trapped-in-mosul-for-years-finally-reunited-with-mother/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector
    loader = NewsLoader(selector=sel)

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    # Media pages. NOTE: these can be multipage; only the first page's
    # text is captured here.
    loader.add_xpath('bodytext', '//div[contains(@class, "post")]//text()')
    loader.add_xpath('bodytext', '//div[@itemid="#article-entry"]//text()')

    return loader.load_item()
def parse_page(self, response):
    """@url https://www.yahoo.com/news/school-principal-trump-chants-crossed-line-hate-speech-155230984.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime headline
    @scrapes source summary url keywords
    @noscrapes modtime section
    """
    # Depressing lack of modtime, keywords or section on these pages.
    sel = response.selector
    loader = NewsLoader(selector=sel)

    loader.add_value('source', 'Yahoo! News [US]')

    # Standardised cross-provider extraction. NOTE: schema.org parsing is
    # deliberately not used for Yahoo. TakeFirst() fields prefer values
    # added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_xpath('bodytext',
                     '//div[contains(@class, "canvas-body")]/p/text()')
    # NOTE: Maybe modtime
    loader.add_xpath('firstpubtime',
                     '//div[contains(@class, "auth-attr")]//time/@datetime')
    loader.add_xpath(
        'bylines',
        '//div[contains(@class, "auth-attr")]//div[contains(@class, "author-name")]//text()')

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.mirror.co.uk/news/uk-news/lesbian-couple-who-launched-crowdfunding-9902318
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector

    # Drop forms and the "read more" asides before loading; scrapy has no
    # native delete, so mutate the tree in place.
    mutate_selector_del_xpath(sel, '//form')
    mutate_selector_del_xpath(
        sel, '//aside[contains(@class,"read-more-links")]')

    loader = NewsLoader(selector=sel)

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_css('bodytext', '.article-body ::text')  # Live

    return loader.load_item()
def parse_page(self, response):
    # Extract a BuzzFeed-style article.
    sel = response.selector

    # Physical-print-only and hidden elements are dropped before loading.
    mutate_selector_del_xpath(
        sel,
        '//*[contains(@class, "print") or contains(@class, "hidden")]'
    )  # Physical print only

    loader = NewsLoader(selector=sel)

    # The canonical link is free of the referer params tacked onto the
    # ends of feed URLs.
    loader.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    # Body fallbacks for the various page layouts.
    loader.add_xpath(
        'bodytext',
        '//div[@data-print="body"]/*[not(contains(@class, "user-bio") or contains(@class, "_shares") or contains(@class, "inline-promo"))]//text()'
    )
    loader.add_xpath('bodytext',
                     '//div[contains(@class, "_item_text")]//text()')
    loader.add_xpath(
        'bodytext',
        '//article//*[contains(@class, "subbuzz-text") or '
        'contains(@class, "subbuzz__title")]//text()')

    return loader.load_item()
def parse_page(self, response):
    sel = response.selector
    loader = NewsLoader(selector=sel)

    # A (JS-rendered) Vice page carries several articles; only the first
    # is of interest. There are also, unhelpfully, several nested <div>s
    # whose classes contain "article__body"; the innermost one (which
    # also carries a "bod-" class) is the real body.
    loader.add_xpath(
        'bodytext',
        '(//article)[1]//div[contains(@class, "article__body") and contains(@class, "bod-")]//text()'
    )

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    return loader.load_item()
def parse_page(self, response):
    """@url http://uk.reuters.com/article/us-heart-nih-funding-idUKKBN16Y2EI
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector

    # Strip "related content" boxes before loading.
    mutate_selector_del(sel, 'css', 'div.related-content')

    loader = NewsLoader(selector=sel)
    loader.add_value('source', 'Reuters [UK]')

    # Standardised cross-provider extraction. NOTE: OpenGraph parsing is
    # deliberately not used for Reuters. TakeFirst() fields prefer values
    # added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_scrapymeta(response)

    # Article text, excluding the author strapline.
    loader.add_xpath(
        'bodytext',
        '//span[@id="article-text"]/'
        '*[not(@class="author")]//text()')
    loader.add_xpath('summary', '//meta[@name="description"]/@content')
    loader.add_value('notes', 'fetchtime delayed by slow feed')

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.nbcnews.com/news/asian-america/denied-visas-u-s-tibet-women-s-soccer-team-hold-n728626
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector
    loader = NewsLoader(selector=sel)

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_xpath('bodytext',
                     '//div[contains(@class, "article-body")]//text()')

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.foxnews.com/opinion/2017/02/28/if-trump-really-wants-to-restore-america-to-greatness-hell-have-to-compromise-with-democrats.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes section source summary url
    @noscrapes keywords
    """
    sel = response.selector
    loader = NewsLoader(selector=sel)

    # These go before the standard calls so they win for TakeFirst()
    # fields.
    loader.add_xpath('bodytext',
                     '//*[contains(@class, "article-text")]//text()')
    loader.add_xpath('section',
                     '//*[contains(@class, "section-title")]//text()')
    loader.add_xpath('section', 'head/meta[@name="prism-section"]/@content')

    # Well, this is awkward. Bylines (normally) not in metadata, and not
    # given a suitable class label in the HTML source.
    loader.add_xpath(
        'bylines',
        '//div[contains(@class, "article-info")]//p[contains(., "By")]/span//text()')

    # Standardised cross-provider extraction to fill the remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_dublincore()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    return loader.load_item()
def parse_page(self, response):
    sel = response.selector
    loader = NewsLoader(selector=sel)

    loader.add_xpath(
        'bylines',
        '//*[contains(@class, "author-card__details__name")]//text()')

    # UK edition layout.
    loader.add_xpath('bodytext',
                     '//div[contains(@class, "entry__body")]//text()')
    # DE edition layout.
    loader.add_xpath('bodytext', '//div[@id="mainentrycontent"]//text()')

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    return loader.load_item()
def parse_page(self, response):
    """@url http://abcnews.go.com/Politics/house-intelligence-committee-sets-framework-russian-probe/story?id=45846073
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector
    loader = NewsLoader(selector=sel)

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_xpath('section', '//article/@data-section')
    loader.add_xpath('modtime', 'head/meta[@name="Last-Modified"]/@content')
    # Visible timestamp needs its dross stripped before date parsing.
    loader.add_xpath(
        'firstpubtime',
        '//div[contains(@class, "article-meta")]//span[contains(@class, "timestamp")]/text()',
        self._strip_timestamp)

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.independent.co.uk/news/world/americas/muslim-american-activist-tarek-el-messidi-jewish-cemetery-mt-carmel-philadelphia-vandalised-a7601266.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector

    # Picture galleries, generally unrelated to the story.
    mutate_selector_del(sel, 'css', '.type-gallery')
    # "More about" link-pipe grot.
    mutate_selector_del(
        sel, 'xpath', '//li[contains(text(), "More about")]/'
                      'parent::*[contains(@class, '
                      '"inline-pipes-list")]')

    loader = NewsLoader(selector=sel)

    loader.add_xpath(
        'bylines',
        '//article//*[@itemprop="author"]//*[@itemprop="name"]//text()')

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    return loader.load_item()
def parse_page(self, response):
    # firstpubtime from RSS feed, so won't appear for contract.
    """@url http://www.bbc.co.uk/news/uk-politics-39020260
    @returns items 1
    @scrapes bodytext fetchtime headline
    @scrapes section source summary url
    @noscrapes modtime keywords
    """
    s = response.selector
    # Remove any content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    mutate_selector_del(s, 'xpath', '//*[@class="off-screen"]')

    l = NewsLoader(selector=s)
    l.add_value('source', 'BBC News')

    # BBC titles all have dross at the end, even the embedded ones.
    # Guard against an empty extraction: the old processor indexed x[0]
    # unconditionally and would raise IndexError on a title-less page.
    l.add_xpath('headline', 'head/title/text()',
                lambda x: [re.sub(r' - BBC (News(beat)?|Sport)$', '', x[0])]
                if x else x)

    # TODO: Publishes data (including datePublished) as JSON+LD.
    # Need parser. Note that it doesn't seem complete: articleBody in
    # the JSON+LD feed seems to only contain the standfirst.

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_scrapymeta(response)

    # Body/summary fallbacks for the various BBC CMSes.
    l.add_xpath('bodytext',
                '//div[contains(@class, "main_article_text")]//text()')  # Newsbeat
    l.add_xpath('bodytext',
                '//div[contains(@class, "map-body")]//text()')  # media-asset-page
    l.add_xpath('bodytext',
                '//div[contains(@class, "story-body")]//text()')  # Sport
    l.add_xpath('summary',
                '//div[contains(@class, "vxp-media__summary")]//text()')  # Videos
    l.add_xpath('bodytext',
                '//div[contains(@class, "vxp-media__summary")]//text()')  # Videos

    # Newsbeat seems to use a different CMS, which doesn't supply the
    # usual metadata (but which does publish bylines!)
    if response.xpath('//div[contains(@class, "newsbeatlogo")]'):
        l.add_value('section', 'Newsbeat')

    l.add_xpath('bylines', '//span[contains(@class, "byline__name")]/text()')
    l.add_xpath('bylines',
                '//p[contains(@class, "byline")]/text()')  # Newsbeat
    # Sport. Grot selecting by layout code.
    l.add_xpath(
        'bylines',
        '//*[contains(@class, "story__byline")]//p[contains(@class, "gel-long-primer") and not(contains(@class, "gel-long-primer-bold"))]/text()')

    # TODO: Keywords (none?)
    return l.load_item()
def parse_page(self, response):
    """@url http://www.usatoday.com/story/money/markets/2017/02/28/bonds-telling-less-bullish-tale-than-stocks/98503646/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    s = response.selector
    # Remove any content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    # Share tools, print-URL lines and asides are all presentation-only.
    mutate_selector_del_xpath(
        s, '//*[contains(@class, "inline-share-tools")]')
    mutate_selector_del_xpath(
        s, '//*[contains(@class, "article-print-url")]')
    mutate_selector_del_xpath(s, '//aside')

    l = NewsLoader(selector=s)

    l.add_xpath('bylines', 'head/meta[@name="cXenseParse:author"]/@content')

    # Section metadata comes out as "news,world". For this, take "News":
    # first value -> split on commas -> first component -> title-case.
    l.add_xpath(
        'section',
        'head/meta[@itemprop="articleSection"]/@content',
        Compose(
            TakeFirst(),
            lambda x: x.split(','),
            TakeFirst(),
            lambda x: x.title(),
        ))

    # Video pages
    l.add_xpath('summary',
                '//p[contains(@class, "vgm-video-description")]//text()')

    # USA Today provide timestamps to millisecond precision, in a format
    # which dateparser can't handle, so normalise each value first.
    l.add_xpath(
        'firstpubtime',
        '//*[@itemprop="datePublished" or @property="datePublished"]/@content',
        MapCompose(self.fix_usatoday_date))  # CreativeWork
    l.add_xpath(
        'modtime',
        '//*[@itemprop="dateModified" or @property="dateModified"]/@content',
        MapCompose(self.fix_usatoday_date))  # CreativeWork

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_scrapymeta(response)

    return l.load_item()
def parse_page(self, response):
    """@url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    s = response.selector
    # Remove some content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    mutate_selector_del_xpath(s, '//script')
    mutate_selector_del_xpath(s, '//*[@style="display:none"]')
    mutate_selector_del_xpath(
        s, '//div[contains(@class, "related-carousel")]')

    l = NewsLoader(selector=s)

    # Get alternative to RSS-source URL fluff
    l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # .getall() yields plain strings; the previous list(map(lambda ...))
    # was equivalent but noisier, and headline was previously fed the
    # raw SelectorList (Selector objects, not text).
    l.add_value(
        'bylines',
        response.xpath(
            "//span[@data-component='Byline']"
            "//span[@data-component='Text']//a/text()").getall())
    l.add_value(
        'headline',
        response.xpath("//h1[@data-component='Headline']/text()").getall())

    # Assemble the body in one join rather than a quadratic += loop.
    l.add_value(
        'bodytext',
        ''.join(response.xpath('//div[@id="body"]//p//text()').getall()))

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_scrapymeta(response)
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_readability(response)

    # TODO: JS dross in body; might need a standard solution to keep this
    # out.
    # TODO: Related article dross in body. <div class=related-carousel>

    return l.load_item()
def parse_page(self, response):
    """@url https://www.bild.de/politik/ausland/politik-ausland/wef-in-davos-die-top-gaeste-und-die-wichtigsten-themen-67441554.bild.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url language
    """
    s = response.selector
    # Remove any content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.

    # Drop noscript JS warnings (which are otherwise included as the
    # bodytext for video pages).
    mutate_selector_del(s, 'xpath',
                        '//noscript[contains(@class, "warning")]')
    # Remove BildPLUS subscribe notice (note that BildPLUS articles are
    # paywalled, and the text fetched will be only the opening paragraphs.
    mutate_selector_del(
        s, 'xpath',
        '//strong[text()="Ihre neuesten Erkenntnisse lesen Sie mit BILDplus."]'
    )
    # Remove "related topics" etc.
    mutate_selector_del(s, 'xpath',
                        '//aside[contains(@class, "related-topics")]')
    # Remove "Lesen Sie auch" ("Also read") teaser boxes.
    mutate_selector_del(
        s, 'xpath',
        '//div[contains(@class, "tsr-info") and contains(text(), "Lesen Sie auch")]'
    )

    l = NewsLoader(selector=s)

    # Breadcrumbs section
    l.add_xpath('section', '//div[@id="breadcrumb"]//a[@rel="home"]//text()')

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_scrapymeta(response)
    l.add_readability(response)

    # Readability is pretty good on Bild, but sometimes misses the body.
    # Try fallback (this won't be as clean as the readability version as
    # we haven't removed all the "more reading" sections etc.)
    l.add_xpath('bodytext', '//div[contains(@class, "txt")]//text()')
    l.add_xpath('bodytext',
                '//div[contains(@class, "article-body")]//text()')

    return l.load_item()
def parse_page(self, response):
    """@url https://www.theguardian.com/business/2017/feb/20/how-unilever-foiled-kraft-heinzs-115m-takeover-bid-warren-buffett
    @returns items 1
    @scrapes bodytext fetchtime firstpubtime headline bylines
    @scrapes section source summary url modtime keywords
    """
    s = response.selector
    # Remove any content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    # CSS is better for operating on classes than XPath, otherwise
    # either will do.
    # 1. Strip the submeta footer
    mutate_selector_del(s, 'xpath', '//div[contains(@class, "submeta")]')
    # 2. All the <aside> boxes
    mutate_selector_del(s, 'xpath', '//aside')

    l = NewsLoader(selector=s)
    l.add_value('source', 'The Guardian')

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()

    # Some Guardian articles are missing their OpenGraph article section
    # tag. These data-link-name tags are often multiple, so take the
    # first and strip its whitespace.
    l.add_xpath('section',
                '//a[@data-link-name="article section"]/text()',
                TakeFirst(),
                lambda x: x.strip())

    # The body tagging varies depending on the type of article, so let's
    # try several
    # TODO: There's still a bit of grot in this: <aside> tags, the social links
    #       under videos etc.
    # TODO: The <span class="drop-cap"> setup leaves a spurious line break
    #       after the first letter, which results in a space in the output
    #       text.
    l.add_xpath(
        'bodytext',
        '//article//div[contains(@class, "content__main-column")]/*[not(contains(@class, "meta"))]//text()'
    )  # Eyewitness, plus video?
    # Alternative body selectors kept for reference (currently unused):
    # l.add_xpath('bodytext', '//div[@data-component="body"]//*[not(contains(@class, "meta"))]//text()') # Video
    # l.add_xpath('bodytext', '//div[@id="mainCol"]//text()') # Australian poll briefing
    # l.add_xpath('bodytext', '//ul[contains(@class, "gallery")]//text()') # In Pictures
    # l.add_xpath('bodytext', '//div[contains(@class, "gv-slice") and contains(@class, "second-strip")]//text()') # Interactive
    # #item['headline'] = join_strip_list(response.xpath('//h1//text()').extract()),
    # #item['bylines'] = response.xpath('//p[@class="byline"]//span[@itemprop="name"]/text()').extract(),

    return l.load_item()
def parse_page(self, response):
    # Generic parse: strip scripts and hidden elements, then rely on the
    # standard metadata extractors plus readability for the body.
    sel = response.selector
    mutate_selector_del_xpath(sel, '//script')
    mutate_selector_del_xpath(sel, '//*[@style="display:none"]')

    loader = NewsLoader(selector=sel)
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_readability(response)

    return loader.load_item()
def parse_page(self, response):
    """@url http://www.dailymail.co.uk/news/article-4242322/Milo-Yiannopoulos-BANNED-CPAC-conference.html?ITO=1490
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    s = response.selector
    # Remove some content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    mutate_selector_del_xpath(s, '//script')
    mutate_selector_del_xpath(s, '//*[@style="display:none"]')
    mutate_selector_del_xpath(
        s, '//div[contains(@class, "related-carousel")]')

    l = NewsLoader(selector=s)

    # Get alternative to RSS-source URL fluff
    l.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # " for MailOnline" etc. suffixes to strip from byline strings.
    # (The '.' in Dailymail.com is now escaped so it only matches a dot.)
    drosss = (r' for (Dailymail\.com|The Daily Mail|'
              r'Daily Mail Australia|MailOnline)')
    # Sort out bylines with less fluff. NOTE: flags must be passed by
    # keyword — re.sub's fourth positional parameter is count, so the
    # old re.sub(..., re.IGNORECASE) silently passed count=2 and left
    # the match case-sensitive.
    l.add_xpath('bylines',
                'head/meta[@property="article:author"]/@content',
                MapCompose(split_multiple_byline_string,
                           lambda byl: re.sub(drosss, r'', byl,
                                              flags=re.IGNORECASE)))

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()

    # TODO: JS dross in body; might need a standard solution to keep this
    # out.
    # TODO: Related article dross in body. <div class=related-carousel>

    return l.load_item()
def parse_page(self, response):
    """@url http://www.businessinsider.fr/meilleures-entreprises-equilibre-vie-pro-vie-privee-selon-glassdoor?IR=C
    @returns items 1
    @scrapes bodytext bylines fetchtime modtime headline
    @scrapes section source summary url language
    @noscrapes keywords
    """
    sel = response.selector

    # Strip image-caption text and Facebook "like" pods before loading.
    mutate_selector_del(sel, 'xpath',
                        '//p[contains(@class, "wp-caption-text")]')
    mutate_selector_del(sel, 'xpath',
                        '//div[contains(@class, "pod-fb-like")]')

    loader = NewsLoader(selector=sel)

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_xpath('bodytext',
                     '//div[contains(@class, "post-content")]//text()')
    loader.add_xpath('bylines', '//a[@rel="author"]//text()')

    # NOTE: BI prints times for recent articles as "hours since
    # published", but helpfully embeds a unix timestamp in
    # //span[@data-bi-format="date"]/@rel if modtime is ever needed.

    return loader.load_item()
def parse_page(self, response):
    """@url https://www.thesun.co.uk/living/2937147/human-ken-doll-quentin-dehar-who-spent-92k-to-look-like-his-idol-has-dumped-his-surgery-obsessed-barbie-girlfriend-for-dying-her-hair-brown/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector

    # TODO: 'Keywords' and 'tags' for The Sun are different. Decide
    # which we want.

    # Lose "The Sun" link on the bottom of each page.
    mutate_selector_del_xpath(
        sel, '//div[contains(@class, "social--fb-page-button")]')
    # Lose the "related articles" carousel.
    mutate_selector_del_xpath(
        sel, '//div[contains(@class, "rail--trending")]')

    loader = NewsLoader(selector=sel)

    loader.add_xpath('summary', 'meta[@name="description"]/@content')
    # TODO: This is kinda grot. Fine except for names like "John da
    # Silva", which .title() mangles.
    loader.add_xpath(
        'bylines',
        '//span[contains(@class, "article__author-name")]//text()',
        lambda names: (name.title() for name in names))

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    loader.add_xpath(
        'bodytext',
        '//article//div[contains(@class, "article__content")]//text()')

    return loader.load_item()
def parse_page(self, response):
    """Note: firstpubtime also fetched, but via RSS feed (which can't be
    contracted for)

    @url https://www.buzzfeed.com/maryanngeorgantopoulos/white-supremacists-are-spreading-their-message-on-college-ca
    @returns items 1
    @scrapes bodytext bylines fetchtime headline
    @scrapes section source summary url keywords language
    @noscrapes modtime
    """
    sel = response.selector

    # Physical-print-only and hidden elements are dropped before loading.
    mutate_selector_del_xpath(
        sel,
        '//*[contains(@class, "print") or contains(@class, "hidden")]'
    )  # Physical print only

    loader = NewsLoader(selector=sel)

    # The canonical link is free of the referer params tacked onto the
    # ends of feed URLs.
    loader.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # Standardised cross-provider extraction; TakeFirst() fields prefer
    # values added earlier, later calls fill remaining gaps.
    loader.add_fromresponse(response)
    loader.add_htmlmeta()
    loader.add_schemaorg(response)
    loader.add_opengraph()
    loader.add_scrapymeta(response)

    # Body fallbacks for the various page layouts.
    loader.add_xpath(
        'bodytext',
        '//div[@data-print="body"]/*[not(contains(@class, "user-bio") or contains(@class, "_shares") or contains(@class, "inline-promo"))]//text()'
    )
    loader.add_xpath('bodytext',
                     '//div[contains(@class, "_item_text")]//text()')
    loader.add_xpath(
        'bodytext',
        '//article//*[contains(@class, "subbuzz-text") or '
        'contains(@class, "subbuzz__title")]//text()')

    return loader.load_item()
def parse_page(self, response):
    """@url https://www.nytimes.com/2017/02/28/science/california-aging-dams.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url
    """
    sel = response.selector

    # Prune boilerplate before loading; scrapy offers no native
    # loader/selector deletion methods, so mutate the tree directly.
    mutate_selector_del(sel, 'xpath',
                        '//footer[contains(@class, "story-footer")]')
    for css_rule in ('.nocontent', '.visually-hidden', '.newsletter-signup'):
        mutate_selector_del(sel, 'css', css_rule)

    ldr = NewsLoader(selector=sel)

    ldr.add_value('source', 'New York Times')
    # NYT's response headers lead to a non-canonical URL with ?_r=0 at
    # the end, so take the canonical <link> instead.
    ldr.add_xpath('url', 'head/link[@rel="canonical"]/@href')

    # Standardised cross-provider extraction; add_* calls made earlier
    # win for TakeFirst() fields, later ones fill remaining gaps.
    ldr.add_fromresponse(response)
    ldr.add_htmlmeta()
    ldr.add_schemaorg(response)
    ldr.add_opengraph()
    ldr.add_scrapymeta(response)

    ldr.add_xpath('headline',
                  '//*[contains(@class, "Post__headline")]//text()')
    ldr.add_xpath('section',
                  '//*[contains(@class, "Post__kicker")]//text()')
    ldr.add_xpath(
        'bodytext',
        '//*[contains(@class, "story-body") or '
        'contains(@class, "Post__body")]//text()')
    ldr.add_xpath('bodytext',
                  '//div[contains(@class, "body--story")]//p//text()')
    ldr.add_css('bodytext', '.interactive-graphic ::text')

    return ldr.load_item()
def parse_page(self, response):
    """@url http://bigstory.ap.org/article/fc451fdf7e9a47c1b2b9ab95f55c3bfe/tusk-closing-2nd-term-eu-council-president
    @returns items 1
    @scrapes bodytext bylines fetchtime modtime headline
    @scrapes keywords source summary url
    @noscrapes firstpubtime section
    """
    sel = response.selector
    # Nothing needs deleting from this site's tree before loading.
    #mutate_selector_del_xpath(sel, '//*[contains(@class, "classname")]')

    ldr = NewsLoader(selector=sel)

    ldr.add_value('source', 'Associated Press')

    # Standardised cross-provider extraction; add_* calls made earlier
    # win for TakeFirst() fields, later ones fill remaining gaps.
    ldr.add_fromresponse(response)
    ldr.add_htmlmeta()
    ldr.add_schemaorg(response)
    ldr.add_opengraph()
    ldr.add_scrapymeta(response)

    #ldr.add_schemaorg_bylines()
    #ldr.add_dublincore()

    ldr.add_xpath('headline', 'head/title/text()')
    ldr.add_xpath('summary', 'head/meta[@name="description"]/@content')
    ldr.add_xpath('bylines', '//div[@id="byline"]//a/text()')
    ldr.add_xpath('bodytext',
                  '//div[contains(@class, "field-name-body")]//text()')
    ldr.add_xpath(
        'modtime',
        '//div[@id="dateline"]/span[contains(@class, "updated")]/@title')
    # These are sometimes exposed as <meta name='keywords'>, sometimes
    # not, so pull the tag links directly.
    ldr.add_xpath('keywords', '//div[contains(@class, "tags")]//a/text()')

    return ldr.load_item()
def parse_page(self, response):
    """@url http://www.prnewswire.co.uk/news-releases/virtual-reality-huge-investment-support-accelerates-innovations-and-expands-application-scope-615544713.html
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime headline
    @scrapes keywords source summary url
    @noscrapes modtime section
    """
    sel = response.selector
    # Nothing needs deleting from this site's tree before loading.
    #mutate_selector_del_xpath(sel, '//*[contains(@class, "classname")]')

    ldr = NewsLoader(selector=sel)

    ldr.add_value('source', 'PR Newswire [UK]')

    # These <meta> tags aren't parsing as head/meta for some reason, so
    # address them with absolute queries.
    ldr.add_xpath('summary', '//meta[@name="description"]/@content')
    ldr.add_xpath('bylines', '//meta[@name="author"]/@content')
    ldr.add_xpath('keywords', '//meta[@name="keywords"]/@content')

    # Standardised cross-provider extraction; add_* calls made earlier
    # win for TakeFirst() fields, later ones fill remaining gaps.
    ldr.add_fromresponse(response)
    ldr.add_htmlmeta()
    ldr.add_schemaorg(response)
    ldr.add_opengraph()
    ldr.add_scrapymeta(response)

    #ldr.add_schemaorg_bylines()
    #ldr.add_dublincore()

    ldr.add_xpath('firstpubtime', '//meta[@name="date"]/@content')
    ldr.add_xpath('bodytext', '//div[contains(@class, "news-col")]//text()')
    ldr.add_xpath('headline', '//h1/text()')

    return ldr.load_item()
def parse_page(self, response):
    """@url https://www.washingtonpost.com/news/politics/wp/2017/03/27/trumps-approval-hits-a-new-low-of-36-percent-but-thats-not-the-bad-news/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime headline
    @scrapes keywords section source summary url
    @noscrapes modtime
    """
    sel = response.selector
    # Nothing needs deleting from this site's tree before loading.
    #mutate_selector_del_xpath(sel, '//*[@style="display:none"]')

    ldr = NewsLoader(selector=sel)

    # WaPo emits invalid ISO date/time strings: <datetime>-500 instead
    # of <datetime>-05:00. The standardised add_* methods below will
    # therefore log 'Failed to parse data' items — harmless, because we
    # parse the value correctly right here.
    ldr.add_xpath('firstpubtime',
                  '//*[@itemprop="datePublished" or '
                  '@property="datePublished"]/@content',
                  MapCompose(self.fix_iso_date))  # CreativeWork
    # Author names are duplicated in the markup, so uniquise them.
    ldr.add_xpath('bylines',
                  '//*[@itemprop="author"]//*[@itemprop="name"]//text()',
                  set)
    ldr.add_xpath('section',
                  '//*[contains(@class, "headline-kicker")]//text()')

    # Standardised cross-provider extraction; the add_* calls above take
    # precedence for TakeFirst() fields, later ones fill remaining gaps.
    ldr.add_fromresponse(response)
    ldr.add_htmlmeta()
    ldr.add_schemaorg(response)
    ldr.add_opengraph()
    ldr.add_scrapymeta(response)

    return ldr.load_item()
def parse_page(self, response):
    """@url http://www.telegraph.co.uk/news/2017/02/27/grandmother-has-married-briton-27-years-deported-singapore-just/
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime headline
    @scrapes keywords section source summary url
    @noscrapes modtime
    """
    sel = response.selector
    # Nothing needs deleting from this site's tree before loading.
    #mutate_selector_del_xpath(sel, '//*[@style="display:none"]')

    ldr = NewsLoader(selector=sel)

    # Top-level section, taken from the navigation breadcrumb bar.
    # Probably a bit fragile.
    ldr.add_xpath(
        'section',
        '//a[contains(@class, "header-breadcrumbs__link")]//text()',
        TakeFirst())
    ldr.add_xpath(
        'bylines',
        '//main//*[@itemprop="author"]//*[@itemprop="name"]//text()')

    # Standardised cross-provider extraction; the add_* calls above take
    # precedence for TakeFirst() fields, later ones fill remaining gaps.
    ldr.add_fromresponse(response)
    ldr.add_htmlmeta()
    ldr.add_schemaorg(response)
    ldr.add_opengraph()
    ldr.add_scrapymeta(response)

    # Note paywalled articles so truncated bodies are explicable.
    if response.xpath('//div[contains(@class, "premium-paywall")]'):
        ldr.add_value('notes', 'Premium paywall')

    return ldr.load_item()
def parse_page(self, response):
    """@url https://www.liverpoolecho.co.uk/news/liverpool-news/police-issue-warning-over-gift-19660932
    @returns items 1
    @scrapes bodytext bylines fetchtime firstpubtime modtime headline
    @scrapes keywords section source summary url language
    """
    s = response.selector
    # Remove any content from the tree before passing it to the loader.
    # There aren't native scrapy loader/selector methods for this.
    # CSS is better for operating on classes than XPath; otherwise
    # either will do.
    mutate_selector_del(s, 'xpath', '//aside')
    #mutate_selector_del(s, 'css', '.classname')

    l = NewsLoader(selector=s)

    # Add a number of items of data that should be standardised across
    # providers. Can override these (for TakeFirst() fields) by making
    # l.add_* calls above this line, or supplement gaps by making them
    # below.
    l.add_fromresponse(response)
    l.add_htmlmeta()
    l.add_schemaorg(response)
    l.add_opengraph()
    l.add_scrapymeta(response)
    l.add_readability(response)

    #l.add_schemaorg_bylines()
    #l.add_dublincore()

    # Site-internal article id, needed downstream (and by the comments
    # machinery's container lookup).
    l.add_xpath('articleid', '//meta[@property="article:id"]/@content')

    # We're going to try to scrape the comments. This is a little
    # involved. In order to get comments, we need two things:
    #  1. The site_uuid (which *should* probably be permanently fixed,
    #     but which live pages nevertheless bootstrap every time)
    #  2. The content_container_uuid
    # The content_container_uuid can't be had without knowing both the
    # site uuid and the container_id (which is embedded in the page
    # metadata).
    #
    # We have obtained the site_uuid at crawler startup using this
    # class's start_requests() function. But the content_container_uuid
    # is different for every article. So we need to fetch it now, and
    # continue the processing of this article in the callback.
    #
    # We are, therefore, going to yield a new Request, containing our
    # half-finished loader as a metadata item. The callback function for
    # the new Request will try to fetch comments and add them in,
    # finishing by yielding a complete Item for the crawler to handle.
    if l.get_xpath('//vf-conversations') and self.comments_bootstrap:
        site_uuid = self.comments_bootstrap['settings']['site_uuid']
        containerid = l.get_xpath(
            '//meta[@name="vf:container_id"]/@content')[0]
        yield Request(
            f"https://livecomments.viafoura.co/v4/livecomments/{site_uuid}/contentcontainer/id?container_id={containerid}",
            method="GET",
            priority=5,
            callback=self.parse_comments_get_contentcontainer,
            errback=self.errback_comments,
            cb_kwargs={
                'l': l,
                'site_uuid': site_uuid
            },
            meta={
                # We don't even want to fetch robots.txt here.
                'dont_obey_robotstxt': True
            })
    else:
        # No comments UI (or no bootstrap data): emit the item directly.
        logger.debug(f'No comments section: {response.url}')
        l.add_value('notes', 'No comments section')
        yield l.load_item()