def _parse_episode(self, response):
    """Build a feed entry for a single puls4.com episode page."""
    loader = FeedEntryItemLoader(
        response=response,
        base_url="https://" + self.name,
        timezone="Europe/Vienna",
        dayfirst=True,
    )
    loader.add_value("link", response.url)
    loader.add_xpath(
        "title",
        '//meta[@name="title"]/@content',
        re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
    )
    # The broadcast date lives in the title meta tag; the time of day comes
    # from the request meta data (fall back to midnight).
    date = response.xpath('//meta[@name="title"]/@content').re_first(
        r".*vom (\d{2}\.\d{2}\.\d{4}).*"
    )
    loader.add_value("updated", "{} {}".format(date, response.meta["time"] or "00:00"))
    image = response.xpath('//meta[@property="og:image"]/@content').extract_first()
    loader.add_value("content_html", '<img src="{}">'.format(image))
    loader.add_css("content_html", ".player-video-description-intro::text")
    return loader.load_item()
def _parse_article(self, response):
    """Parse a single article page into a feed entry.

    Deleted articles (HTTP 410) yield nothing.
    """
    if response.status == 410:
        # Article has been deleted.
        return
    remove_elems = [
        '.bildtext .author',
        'iframe',
    ]
    change_tags = {
        'h1': 'h2'
    }
    il = FeedEntryItemLoader(response=response,
                             timezone=self._timezone,
                             base_url='https://www.{}'.format(self.name),
                             remove_elems=remove_elems,
                             change_tags=change_tags,
                             dayfirst=False,
                             yearfirst=False)
    if response.css('.payment'):
        il.add_value('category', 'paywalled')
    il.add_css('link', 'link[rel="canonical"]::attr(href)')
    il.add_css('title', 'meta[property="og:title"]::attr(content)')
    # Fix: raw string — '\s' in a plain string literal is an invalid escape
    # sequence (SyntaxWarning since Python 3.12).
    il.add_css('author_name', '.druckheadline::text', re=r'·\s*(.*)\s*·')
    il.add_css('updated', 'meta[http-equiv="last-modified"]::attr(content)')
    il.add_css('content_html', '.druckcontent')
    il.add_value('path', response.meta['ressort'])
    yield il.load_item()
def parse_item(self, response):
    """Build a feed entry from an article page."""
    loader = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=[
            "aside",
            "script",
            "h1",
            "source",
            ".breadcrumbs",
            ".author-date",
            ".artikel-social-kommentar",
            ".bild-copyright",
            ".ressortTitleMobile",
            ".article-number",
            ".artikel-kommentarlink",
            ".umfrage-wrapper",
            ".articleIssueInfo",
        ],
    )
    loader.add_value("link", response.url)
    # Fall back to the generic editorial byline if no author is found.
    byline = response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)")
    loader.add_value("author_name", byline or "Red.")
    loader.add_css("title", 'h1[itemprop="headline"]::text')
    loader.add_value("updated", response.meta["updated"])
    loader.add_css("content_html", "article")
    return loader.load_item()
def _parse_episode(self, response):
    """Create a feed entry for one episode of a puls4.com show."""
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        timezone="Europe/Vienna",
        dayfirst=True,
    )
    il.add_value("link", response.url)
    il.add_xpath(
        "title",
        '//meta[@name="title"]/@content',
        re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
    )
    # Combine the date from the title meta tag with the time handed over
    # via the request meta (midnight if unknown).
    title_meta = response.xpath('//meta[@name="title"]/@content')
    broadcast_date = title_meta.re_first(r".*vom (\d{2}\.\d{2}\.\d{4}).*")
    broadcast_time = response.meta["time"] or "00:00"
    il.add_value("updated", f"{broadcast_date} {broadcast_time}")
    og_image = response.xpath('//meta[@property="og:image"]/@content').extract_first()
    il.add_value("content_html", f'<img src="{og_image}">')
    il.add_css("content_html", ".player-video-description-intro::text")
    return il.load_item()
def parse_album(self, response):
    """Build a feed entry for an album product page."""

    def _replace_track_info(elem):
        # Render track name and duration as "<p>name <i>(duration)</i></p>".
        parts = [child.text_content().strip() for child in elem.getchildren()]
        return '<p>{} <i>({})</i></p>'.format(parts[0], parts[1])

    title = (
        response.xpath('//h1[@class="c-product-block__title"]//text()')
        .extract()[-1]
        .strip()
    )
    artist = response.xpath(
        '//div[contains(@class,"c-product-block__contributors")]/p/text()'
    ).re_first('[^,]+')
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}/".format(self.name),
        remove_elems=[
            '.c-product-block__title',
            '.c-product__product-purchase',
            '.c-track__format-specific-info',
            '.c-track__duration',
            '.c-track__details',
            '.c-tracklist__initial-tracks',
            '.c-tabs-block__tabs-links',
            'button',
        ],
        replace_elems={'.c-track__all-format-info': _replace_track_info},
    )
    il.add_value("title", '{} - {}'.format(artist, title))
    il.add_value("link", response.url)
    il.add_value("author_name", 'bot')
    il.add_css("content_html", 'div.c-page--product')
    return il.load_item()
def parse_item(self, response):
    """Build a feed entry; the author is parsed from the "von ..." byline."""
    author_date = " ".join(response.css(".author-date ::text").extract())
    match = re.search(r"von\s+(.*)", author_date)
    if match:
        author_name = match.group(1)
    else:
        # Generic editorial byline.
        author_name = "Red."
    loader = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=[
            "aside",
            "script",
            "h1",
            "source",
            ".breadcrumbs",
            ".author-date",
            ".artikel-social-kommentar",
            ".bild-copyright",
            ".ressortTitleMobile",
            ".article-number",
            ".artikel-kommentarlink",
            ".umfrage-wrapper",
            ".articleIssueInfo",
            "hr",
            "center div[style='padding: 10px; background:#efefef']",
        ],
    )
    loader.add_value("link", response.url)
    loader.add_value("author_name", author_name)
    loader.add_css("title", 'h1[itemprop="headline"]::text')
    loader.add_value("updated", response.meta["updated"])
    loader.add_css("content_html", "article")
    return loader.load_item()
def parse_item(self, response):
    """Build a feed entry from a news single-item page."""
    il = FeedEntryItemLoader(
        response=response,
        base_url='{}/cms/'.format(self._link),
        timezone=self._timezone,
        remove_elems=['.news-latest-date', '.news-single-rightbox', 'hr', 'h7'],
        remove_elems_xpath=[
            '//div[@class="news-single-item"]/b[1]',
            '//div[@class="news-single-item"]/br[1]',
        ],
    )
    # The page title has the form "::: <actual title>".
    il.add_value('title', response.xpath('//head/title/text()').re_first(r'::: (.*)'))
    il.add_value('link', response.url)
    il.add_value(
        'updated',
        response.xpath('//div[@class="news-single-rightbox"]').re_first(
            r'(\d{2}\.\d{2}\.\d{4})'
        ),
    )
    # Author fallback chain: publisher meta, author meta, site name.
    il.add_value(
        'author_name',
        response.xpath('//head/meta[@name="publisher"]/@content').re_first(
            'recht.at, (.*);'
        ),
    )
    il.add_xpath('author_name', '//head/meta[@name="author"]/@content')
    il.add_value('author_name', self.name)
    il.add_xpath('author_email', '//head/meta[@name="reply-to"]/@content')
    il.add_css('content_html', '.news-single-item h7 font strong')
    il.add_css('content_html', '.news-single-item')
    yield il.load_item()
def parse_item(self, response):
    """Turn an article page into a feed entry."""
    selectors_to_drop = [
        "aside",
        "script",
        "h1",
        "source",
        ".breadcrumbs",
        ".author-date",
        ".artikel-social-kommentar",
        ".bild-copyright",
        ".ressortTitleMobile",
        ".article-number",
        ".artikel-kommentarlink",
        ".umfrage-wrapper",
        ".articleIssueInfo",
    ]
    il = FeedEntryItemLoader(
        response=response,
        base_url=f"https://{self.name}",
        remove_elems=selectors_to_drop,
    )
    il.add_value("link", response.url)
    # Use "Red." (editorial staff) when no author can be extracted.
    il.add_value(
        "author_name",
        response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)") or "Red.",
    )
    il.add_css("title", 'h1[itemprop="headline"]::text')
    il.add_value("updated", response.meta["updated"])
    il.add_css("content_html", "article")
    return il.load_item()
def _parse_article(self, response):
    """Parse one article; deleted articles (HTTP 410) yield nothing."""
    if response.status == 410:
        # Article has been deleted.
        return
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url="https://www." + self.name,
        remove_elems=[".bildtext .author", "iframe"],
        change_tags={"h1": "h2", ".bildbox": "figure", ".bildtext": "figcaption"},
        dayfirst=True,
        yearfirst=False,
    )
    if response.css(".payment"):
        il.add_value("category", "paywalled")
    il.add_css("link", 'link[rel="canonical"]::attr(href)')
    il.add_css("title", 'meta[property="og:title"]::attr(content)')
    il.add_css("author_name", ".druckheadline::text", re=r"·\s*(.*)\s*·")
    # Header value looks like: Mon, 01 Oct 18 13:42:45 +0200
    il.add_css("updated", 'meta[http-equiv="last-modified"]::attr(content)')
    il.add_css("content_html", ".druckcontent")
    il.add_value("path", response.meta["ressort"])
    return il.load_item()
def _parse_article(self, response):
    """Attach the article body to the parent item loader and finish the entry."""
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta['il'],
        remove_elems=['iframe', 'script'],
        base_url='http://{}'.format(self.name),
    )
    il.add_css('content_html', '.entry-content')
    return il.load_item()
def _parse_article(self, response):
    """Extract the page content into the parent item loader."""
    loader = FeedEntryItemLoader(
        response=response,
        parent=response.meta['il'],
        base_url='http://' + self.name,
        remove_elems=['#issue', 'h1', '#slogan', '#logo', '#footer'],
    )
    loader.add_css('content_html', '#page')
    yield loader.load_item()
def _parse_article(self, response):
    """Add the page content to the parent loader and build the entry."""
    loader = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        base_url="https://{}".format(self.name),
        remove_elems=["#issue", "h1", "#slogan", "#logo", "#footer"],
    )
    loader.add_css("content_html", "#page")
    return loader.load_item()
def _parse_article(self, response):
    """Complete a feed entry with the article's page content."""
    # Page chrome that must not end up in the feed.
    chrome = ["#issue", "h1", "#slogan", "#logo", "#footer"]
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        base_url="https://" + self.name,
        remove_elems=chrome,
    )
    il.add_css("content_html", "#page")
    return il.load_item()
def parse_item(self, response):
    """Build a feed entry for an event page."""
    loader = FeedEntryItemLoader(
        response=response,
        base_url=f"{self.feed_link}/",
        timezone="Europe/Vienna",
        dayfirst=True,
        remove_elems=[".ruler", "h1"],
    )
    loader.add_css("title", "h1.event-title::text")
    loader.add_value("link", response.url)
    loader.add_css("content_html", "div#content.container")
    return loader.load_item()
def parse_item_text(self, response):
    """Add the article text to the parent loader; flag paywalled content."""
    loader = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=[".ad-component", ".wp-caption-text"],
        base_url="https://{}".format(self.name),
    )
    # A ".bluebox" element marks subscriber-only articles.
    if response.css(".bluebox"):
        loader.add_value("category", "paywalled")
    loader.add_css("content_html", "div.pR")
    return loader.load_item()
def parse_item(self, response):
    """Create a feed entry for a single event page."""
    entry = FeedEntryItemLoader(
        response=response,
        base_url="{}/".format(self.feed_link),
        timezone="Europe/Vienna",
        dayfirst=True,
        remove_elems=[".ruler", "h1"],
    )
    entry.add_css("title", "h1.event-title::text")
    entry.add_value("link", response.url)
    entry.add_css("content_html", "div#content.container")
    return entry.load_item()
def _parse_weekly_edition(self, response):
    """Parse a weekly edition page.

    Yields one request per article that has its own page (handled by
    ``_parse_article``) and finally a single feed entry containing the
    remaining edition content.
    """
    remove_elems = ["h1"]
    # Map the site's heading classes onto a proper h1-h4 hierarchy.
    change_tags = {
        ".Cat1HL": "h1",
        ".Cat2HL": "h2",
        ".Cat3HL": "h3",
        ".SummaryHL": "h4",
    }
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        change_tags=change_tags,
        remove_elems=remove_elems,
        base_url=f"https://{self.name}",
    )
    # Scrape every article that has a dedicated page separately.
    for url in response.css("h2.SummaryHL a::attr(href)").extract():
        yield scrapy.Request(
            response.urljoin(url),
            self._parse_article,
            meta={
                "il": None,
                "updated": response.meta["updated"]
            },
        )
    # Remove articles that have their own page.
    text = []
    in_article = False
    for line in response.css(".ArticleText").extract_first().splitlines(
            True):
        # Beginning of article.
        if '<h2 class="SummaryHL"><a href="/Articles/' in line:
            in_article = True
        if not in_article:
            text.append(line)
        # End of article. Note that the links to the comments doesn't
        # always include "#comments" so we can't check for that.
        if '">Comments (' in line:
            in_article = False
    text = "".join(text)
    # Remove page editor.
    text = re.sub(r"<b>Page editor</b>: .*", "", text)
    # Recursively remove headings with no content.
    text = _remove_empty_headings(text)
    il.add_css("title", "h1::text")
    il.add_value("content_html", text)
    il.add_value("link", response.url)
    yield il.load_item()
def _parse_article(self, response):
    """Parse an article page, dropping responses hit by bot detection."""
    title = response.css('meta[property="og:title"]::attr(content)').extract_first()
    # A missing og:title indicates the bot-detection page was served.
    if not title:
        raise DropResponse(
            f"Skipping {response.url} because ran into bot detection",
            transient=True,
        )
    il = FeedEntryItemLoader(
        response=response,
        base_url=f"https://{self.name}",
        remove_elems=[
            "meta",
            ".ds-share-list",
            ".advert",
            ".layout-article-links",
            ".ds-chapter-list",
            ".layout-article-meta",
        ],
        change_tags={
            ".article__lead-image": "figure",
            ".article__description": "h2",
            ".article__footnote": "i",
        },
    )
    il.add_value("link", response.url)
    il.add_value("title", title)
    il.add_css("updated", "time.article__dateline-datetime::attr('datetime')")
    il.add_css("content_html", ".article__lead-image")
    il.add_css("content_html", ".article__description")
    il.add_css("content_html", ".layout-article-body")
    il.add_value("path", response.meta["ressort"])
    return il.load_item()
def _parse_article(self, response):
    """Build a feed entry from an article page, fixing lazy-loaded images."""

    def _use_original_image(elem):
        # Lazy-loaded images carry the real URL in "data-original".
        if "data-original" in elem.attrib:
            elem.attrib["src"] = elem.attrib["data-original"]
        return elem

    loader = FeedEntryItemLoader(
        response=response,
        base_url=self._base_url,
        remove_elems=[
            ".credit",
            ".hide-caption",
            ".toggle-caption",
            ".enlarge-options",
            ".enlarge_measure",
            ".enlarge_html",
            ".ad-backstage",
            'p:first-of-type:contains("Editor\'s Note: This is an excerpt of")',
            'p:contains("Did you enjoy this newsletter segment?")',
        ],
        replace_elems={"img": _use_original_image},
        change_tags={".image": "figure", ".credit-caption": "figcaption"},
    )
    loader.add_css("title", "h1 ::text")
    loader.add_value("link", response.url)
    loader.add_css("content_html", "#storytext")
    loader.add_value("path", response.meta["path"])
    loader.add_css("updated", '.dateblock time::attr("datetime")')
    loader.add_css("author_name", ".byline__name a::text")
    yield loader.load_item()
def _parse_weekly_edition(self, response):
    """Parse a weekly edition page.

    Yields a request for every article that has its own page and then a
    single feed entry with the remaining edition content.
    """
    remove_elems = ['h1']
    # Map the site's heading classes onto a regular h1-h4 hierarchy.
    change_tags = {
        '.Cat1HL': 'h1',
        '.Cat2HL': 'h2',
        '.Cat3HL': 'h3',
        '.SummaryHL': 'h4',
    }
    il = FeedEntryItemLoader(response=response,
                             parent=response.meta['il'],
                             change_tags=change_tags,
                             remove_elems=remove_elems,
                             base_url='https://{}'.format(self.name))
    # Articles with a dedicated page are scraped separately.
    for url in response.css('h2.SummaryHL a::attr(href)').extract():
        yield scrapy.Request(response.urljoin(url),
                             self._parse_article,
                             meta={
                                 'il': None,
                                 'updated': response.meta['updated']
                             })
    # Remove articles that have their own page.
    text = []
    in_article = False
    for line in (
            response.css('.ArticleText').extract_first().splitlines(True)):
        # Beginning of article.
        if '<h2 class="SummaryHL"><a href="/Articles/' in line:
            in_article = True
        if not in_article:
            text.append(line)
        # End of article. Note that the links to the comments doesn't
        # always include "#comments" so we can't check for that.
        if '">Comments (' in line:
            in_article = False
    text = ''.join(text)
    # Remove page editor.
    text = re.sub(r'<b>Page editor</b>: .*', '', text)
    # Recursively remove headings with no content.
    text = _remove_empty_headings(text)
    il.add_css('title', 'h1::text')
    il.add_value('content_html', text)
    il.add_value('link', response.url)
    yield il.load_item()
def _parse_article(self, response):
    """Append the article body (with inlined footnotes) to the parent loader."""
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=["script"],
        change_tags={".entry-content-info-box": "blockquote"},
        base_url="https://{}".format(self.name),
        convert_footnotes=[".footnoteContent"],
        pullup_elems={".footnoteContent": 1},
    )
    il.add_css("content_html", ".entry-content")
    return il.load_item()
def _parse_article(self, response):
    """Finish a feed entry by adding the article's main content."""
    # Footnote content is converted in place and pulled up one level.
    footnote_selector = ".footnoteContent"
    loader = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=["script"],
        change_tags={".entry-content-info-box": "blockquote"},
        base_url="https://" + self.name,
        convert_footnotes=[footnote_selector],
        pullup_elems={footnote_selector: 1},
    )
    loader.add_css("content_html", ".entry-content")
    return loader.load_item()
def parse_letter(self, response):
    """Build a feed entry for a single newsletter message."""
    account = response.meta["account"]
    # Each account has its own base URL for resolving relative links.
    loader = FeedEntryItemLoader(response=response, base_url=self._links.get(account))
    loader.add_value("path", account)
    loader.add_value("link", response.url)
    loader.add_css("title", "title::text")
    loader.add_css("author_name", "div#message-heading div.by-line a::text")
    loader.add_css("updated", "div#message-heading div.date::text")
    loader.add_css("content_html", "div.message-body")
    yield loader.load_item()
def _parse_item(self, response):
    """Build a feed entry for a HELP.gv.at page."""
    # Page chrome and boilerplate that must not end up in the feed.
    remove_elems = [
        "h1",
        ".nono",
        ".acceptance_org",
        ".state",
        "script",
        ".gentics-portletreload-position-notvisibleposition",
    ]
    remove_elems_xpath = [
        # Advice boxes that only point out outdated content.
        """
        //div[
            @class='advice' and
            child::div[@class='advice_text' and (
                contains(., 'nicht die aktuelle Rechtslage') or
                contains(., 'wird nicht laufend aktualisiert') or
                contains(., 'Übersicht über bisherige "Themen des Monats"')
            )]
        ]
        """,
        # Remove table of contents.
        "//li[child::a[starts-with(@href, '#')]]",
        "//ul[not(li)]",
    ]
    loader = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url="https://www.{}".format(self.name),
        remove_elems=remove_elems,
        remove_elems_xpath=remove_elems_xpath,
        change_tags={"abbr": "span"},
        dayfirst=True,
    )
    loader.add_value("link", response.url)
    loader.add_xpath(
        "author_name",
        '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
    )
    loader.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
    loader.add_value(
        "updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")
    )
    loader.add_css("content_html", ".Content")
    return loader.load_item()
def _parse_episode(self, response):
    """Build a feed entry for one episode page.

    The broadcast date is scraped from the title meta tag and combined
    with the time of day passed via the request meta data.
    """
    il = FeedEntryItemLoader(response=response,
                             base_url='http://{}'.format(self.name),
                             timezone=self._timezone,
                             dayfirst=True)
    il.add_value('link', response.url)
    # Fix: raw string — '\.' in a plain string literal is an invalid escape
    # sequence (SyntaxWarning since Python 3.12).
    il.add_xpath('title', '//meta[@name="title"]/@content',
                 re=r'(?s)(.*?)(?: vom .*)? - puls4\.com')
    il.add_value('updated', '{} {}'.format(
        response.xpath('//meta[@name="title"]/@content').
        re_first(r'.*vom (\d{2}\.\d{2}\.\d{4}).*'),
        response.meta['time'] or '00:00')
    )
    il.add_value('content_html', '<img src="{}">'.format(
        response.xpath('//meta[@property="og:image"]/@content').
        extract_first()))
    il.add_css('content_html', '.player-video-description-intro::text')
    yield il.load_item()
def parse_item(self, response):
    """Parse an article page into a feed entry.

    Skips articles that are gone (HTTP 404) and articles whose section is
    neither subscribed to nor covered by the "all" pseudo section.
    """
    if response.status == 404:
        self.logger.info("Article '{}' not available anymore.".format(
            response.url))
        return

    def _clean_caption(elem):
        # NOTE: the separator below is an en dash ("–"), not a hyphen.
        if "–" in elem.text:
            # Caption is of the format "text - credit".
            elem.text = re.sub(r"\s*([^–]*).*", r"\1", elem.text)
            return elem
        else:
            # It's just the "credit", remove it.
            return None

    section = response.css('meta[name="kt:section-path"]::attr("content")'
                           ).extract_first()[1:]  # Skip the first /.
    if section not in self._sections and "all" not in self._sections:
        # Ignore the response as the ressort should not be parsed.
        return
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=[
            ".ad",
            ".article-paid",
            ".js-overlay-close",
            ".swiper-lazy-preloader",
        ],
        change_tags={".article__lead": "strong"},
        change_attribs={".zoomable__image--zoomed": {
            "data-src": "src"
        }},
        replace_elems={".article__media-caption": _clean_caption},
        base_url="https://www.{}".format(self.name),
    )
    # Strip an optional "Von "/"von " prefix from the byline.
    il.add_css(
        "author_name",
        "article .article__author ::text",
        re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
    )
    il.add_css("content_html", "article .article__media .zoomable__inner")
    il.add_css("content_html", "article .article__lead")  # change tags to strong
    il.add_css("content_html", "article .article__body")
    if response.css(".article-paid"):
        il.add_value("category", "paywalled")
    il.add_value("category", section.split("/"))
    if "all" in self._sections:
        il.add_value("path", "all")
    if section in self._sections:
        il.add_value("path", section)
    return il.load_item()
def _parse_item(self, response):
    """Create a feed entry from a HELP.gv.at content page."""
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url="https://www." + self.name,
        remove_elems=[
            "h1",
            ".nono",
            ".acceptance_org",
            ".state",
            "script",
            ".gentics-portletreload-position-notvisibleposition",
        ],
        remove_elems_xpath=[
            # Advice boxes that merely flag outdated content.
            """
            //div[
                @class='advice' and
                child::div[@class='advice_text' and (
                    contains(., 'nicht die aktuelle Rechtslage') or
                    contains(., 'wird nicht laufend aktualisiert') or
                    contains(., 'Übersicht über bisherige "Themen des Monats"')
                )]
            ]
            """,
            # Remove table of contents.
            "//li[child::a[starts-with(@href, '#')]]",
            "//ul[not(li)]",
        ],
        change_tags={"abbr": "span"},
        dayfirst=True,
    )
    il.add_value("link", response.url)
    il.add_xpath(
        "author_name",
        '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
    )
    il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
    il.add_value("updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})"))
    il.add_css("content_html", ".Content")
    return il.load_item()
def _parse_article(self, response):
    """Build a feed entry for an article page.

    The update time is taken from the HTTP "Last-Modified" header with a
    fallback to "Date"; if neither header is present the field is left
    unset instead of crashing.
    """
    remove_elems = ["h1", "#contents", ".headerlink"]
    change_tags = {".admonition-title": "h2"}
    il = FeedEntryItemLoader(
        response=response,
        base_url=response.url,
        remove_elems=remove_elems,
        change_tags=change_tags,
    )
    il.add_value("link", response.url)
    il.add_value("author_name", "Brandon Rhodes")
    # Use "Last-Modified" field or fall back to "Date".
    updated = response.headers.get("Last-Modified", response.headers.get("Date"))
    if updated is not None:
        # Fix: previously a response missing both headers raised
        # AttributeError when decoding None.
        il.add_value("updated", updated.decode("ascii"))
    il.add_css("title", "title::text")
    il.add_css("content_html", ".section")
    return il.load_item()
def _parse_article(self, response):
    """Parse an article page.

    For paywalled content a subscriber-link form is submitted and the
    partially filled item loader travels along in the request meta.
    """
    remove_elems = [
        ".FeatureByline",
        ".GAByline",
        ".Form",
        "form",
        ".MakeALink",
        "br",
    ]
    change_tags = {"div.BigQuote": "blockquote"}
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=remove_elems,
        change_tags=change_tags,
        base_url=f"https://{self.name}",
    )
    text = response.css(".ArticleText").extract_first()
    # Remove 'Log in to post comments'.
    text = re.sub(r'<hr width="60%" align="left">.*to post comments\)',
                  "", text, flags=re.S)
    il.add_css("title", "h1::text")
    il.add_value("content_html", text)
    # Feature articles and guest articles store the byline differently;
    # try each location in turn.
    il.add_css("author_name", ".FeatureByline b ::text")
    il.add_css("author_name", ".GAByline a ::text")
    il.add_css(
        "author_name",
        ".GAByline p ::text",
        re="This article was contributed by (.*)",
    )
    il.add_xpath(
        "updated",
        '//div[@class="FeatureByline"]/text()[preceding-sibling::br]',
        TakeFirst(),
    )
    il.add_xpath("updated", '//div[@class="GAByline"]/p[1]/text()')
    # Last resort if date cannot be extracted and it's a weekly edition.
    if "updated" in response.meta:
        il.add_value("updated", response.meta["updated"])
    if response.css(".MakeALink"):
        # Get subscriber link for paywalled content.
        return scrapy.FormRequest.from_response(
            response,
            formcss=".MakeALink form",
            callback=self._subscriber_link,
            meta={"il": il},
        )
    else:
        il.add_value("link", response.url)
        return il.load_item()
def parse_item(self, response):
    """Build a feed entry from a news single item."""
    il = FeedEntryItemLoader(
        response=response,
        base_url="{}/cms/".format(self.feed_link),
        timezone="Europe/Vienna",
        remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"],
        remove_elems_xpath=[
            '//div[@class="news-single-item"]/b[1]',
            '//div[@class="news-single-item"]/br[1]',
        ],
        dayfirst=True,
    )
    # The page title has the form "::: <actual title>".
    il.add_value("title", response.xpath("//head/title/text()").re_first(r"::: (.*)"))
    il.add_value("link", response.url)
    updated = response.xpath('//div[@class="news-single-rightbox"]').re_first(
        r"(\d{2}\.\d{2}\.\d{4})"
    )
    il.add_value("updated", updated)
    # Author fallback chain: publisher meta tag, author meta tag, site name.
    publisher = response.xpath('//head/meta[@name="publisher"]/@content').re_first(
        "recht.at, (.*);"
    )
    il.add_value("author_name", publisher)
    il.add_xpath("author_name", '//head/meta[@name="author"]/@content')
    il.add_value("author_name", self.name)
    il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')
    il.add_css("content_html", ".news-single-item h7 font strong")
    il.add_css("content_html", ".news-single-item")
    return il.load_item()
def parse_item(self, response):
    """Create a feed entry for a single news item page."""
    loader = FeedEntryItemLoader(
        response=response,
        base_url="{}/cms/".format(self.feed_link),
        timezone="Europe/Vienna",
        remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"],
        remove_elems_xpath=[
            '//div[@class="news-single-item"]/b[1]',
            '//div[@class="news-single-item"]/br[1]',
        ],
        dayfirst=True,
    )
    loader.add_value(
        "title", response.xpath("//head/title/text()").re_first(r"::: (.*)")
    )
    loader.add_value("link", response.url)
    loader.add_value(
        "updated",
        response.xpath('//div[@class="news-single-rightbox"]').re_first(
            r"(\d{2}\.\d{2}\.\d{4})"
        ),
    )
    # Try publisher meta first, then author meta, then the site name.
    loader.add_value(
        "author_name",
        response.xpath('//head/meta[@name="publisher"]/@content').re_first(
            "recht.at, (.*);"
        ),
    )
    loader.add_xpath("author_name", '//head/meta[@name="author"]/@content')
    loader.add_value("author_name", self.name)
    loader.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')
    loader.add_css("content_html", ".news-single-item h7 font strong")
    loader.add_css("content_html", ".news-single-item")
    return loader.load_item()
def parse_article(self, response):
    """Build a feed entry for an article on one of the configured sites."""
    site = response.meta["site"]
    loader = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url=self._links[site],
        dayfirst=True,
        yearfirst=False,
        remove_elems=[
            "div.main-content h1:first-of-type",
            "p#ctl00_ctl00_ctl00_cph_col_a_cph_content_cph_content_detail_p_date",
            "div#main-content-header",
        ],
    )
    loader.add_value("path", site)
    loader.add_value("link", response.url)
    loader.add_value("updated", response.meta["updated"])
    loader.add_css("title", "div.main-content h1:first-of-type::text")
    loader.add_css("content_html", "div.main-content")
    yield loader.load_item()
def _parse_article(self, response):
    """Parse one page of a (possibly multi-page) article.

    Follows the last page-number link recursively while a ".next" element
    exists, accumulating the content in a single item loader.
    """
    remove_elems = [
        ".caption-credit",
        ".gallery-image-credit",
        "#social-left",
        "ul.toc",
        "h3:contains('Table of Contents')",
        "br",
        ".sidebar:contains('Further Reading')",
        ".credit",
    ]
    change_tags = {".sidebar": "blockquote", "aside": "blockquote"}
    replace_elems = {"div.image": self._div_to_img}
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=remove_elems,
        replace_elems=replace_elems,
        change_tags=change_tags,
    )
    # Only the first page contributes link, author, header and path.
    if response.meta.get("first_page", False):
        il.add_value("link", response.url)
        il.add_css("author_name", ".byline a span ::text")
        il.add_css("content_html", "header h2")
        il.add_value("path", response.meta["path"])
    il.add_css("content_html", ".article-content")
    if response.css(".next"):
        # Follow the highest listed page number with the same loader.
        return scrapy.Request(
            response.css(".numbers a::attr(href)").extract()[-1],
            self._parse_article,
            meta={"il": il, "path": response.meta["path"]},
        )
    else:
        return il.load_item()
def _parse_video_page(self, response):
    """Parse a video page and request the matching HLS manifest.

    The video id is extracted from the URL; the stream manifest is then
    fetched with the partially filled loader passed along in the meta.
    """
    match = re.search(
        r"https?://(?:www\.)?servustv\.com/videos/(?P<id>[aA]{2}-\w+|\d+-\d+)",
        response.url,
    )
    if not match:
        # Not a recognized video URL.
        return
    video_id = match.group("id").upper()
    il = FeedEntryItemLoader(response=response)
    il.add_value("link", response.url)
    section = response.css(
        "meta[property='article:section']::attr('content')").extract_first(
        )
    # Use the section as a title prefix unless it is the generic one.
    if section != "Allgemein":
        il.add_value("title", section)
    il.add_css("title", "title::text", re="(.*) - Servus TV")
    image_url = response.css(
        "meta[property='og:image']::attr('content')").extract_first()
    il.add_value("content_html", '<img src="{}">'.format(image_url))
    il.add_css("content_html",
               "meta[property='og:description']::attr('content')")
    il.add_css("content_html", "#media-asset-content-container")
    # The modification date is only available in embedded JSON.
    match = re.search(r'"dateModified":\s*"([^"]+)"', response.text)
    if match:
        il.add_value("updated", match.group(1))
    stream_url = "https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8" % video_id
    yield Request(stream_url, self._parse_stream, meta={"il": il})
def _parse_item(self, response):
    """Build a feed entry for a HELP.gv.at page."""
    # Chrome and boilerplate elements that must not appear in the feed.
    remove_elems = [
        'h1',
        '.nono',
        '.acceptance_org',
        '.state',
        'script',
        '.gentics-portletreload-position-notvisibleposition',
    ]
    remove_elems_xpath = [
        # Advice boxes that only flag outdated content.
        """
        //div[
            @class='advice' and
            child::div[@class='advice_text' and (
                contains(., 'nicht die aktuelle Rechtslage') or
                contains(., 'wird nicht laufend aktualisiert') or
                contains(., 'Übersicht über bisherige "Themen des Monats"')
            )]
        ]
        """,
        # Remove table of contents.
        "//li[child::a[starts-with(@href, '#')]]",
        "//ul[not(li)]",
    ]
    loader = FeedEntryItemLoader(
        response=response,
        timezone=self._timezone,
        base_url='https://www.{}'.format(self.name),
        remove_elems=remove_elems,
        remove_elems_xpath=remove_elems_xpath,
        change_tags={'abbr': 'span'},
        dayfirst=True,
    )
    loader.add_value('link', response.url)
    loader.add_xpath(
        'author_name',
        '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
    )
    loader.add_css('title', 'title::text', re=r'HELP.gv.at:\s*(.*)')
    loader.add_value(
        'updated', response.css('.state').re_first(r'(\d{2}\.\d{2}\.\d{4})')
    )
    loader.add_css('content_html', '.Content')
    yield loader.load_item()
def _parse_article_url(self, response):
    """Follow an article teaser to its print version.

    Emits a paywalled stub entry when no print version link is present.
    """
    # Fix: extract_first() may return None for pages without an h2, which
    # previously crashed the "in" check.
    heading = response.css('h2 ::text').extract_first()
    if heading and 'Fehler' in heading:
        self.logger.info('Skipping {} as it returned an error'.format(
            response.url))
        return
    remove_elems = ['div[style="padding-top:10px;"]']
    il = FeedEntryItemLoader(response=response,
                             timezone=self._timezone,
                             base_url='http://{}'.format(self.name),
                             dayfirst=True,
                             remove_elems=remove_elems)
    il.add_value('link', response.url)
    il.add_value('author_name', 'VKI')
    # Fix: raw string — '\s' and '\.' in a plain string literal are invalid
    # escape sequences (SyntaxWarning since Python 3.12).
    date = response.css('.issue').re_first(
        r'veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})')
    il.add_value('updated', date)
    url = (response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
        r"window\.open\('(.*)'\);"))
    il.add_css('title', 'h1::text')
    if url:
        yield scrapy.Request(response.urljoin(url),
                             callback=self._parse_article,
                             meta={'il': il})
    else:
        # No print version: only the teaser is freely available.
        il.add_value('category', 'paywalled')
        il.add_css('content_html', '.primary')
        il.add_css('content_html', 'div[style="padding-top:10px;"] > h3')
        yield il.load_item()
def parse_item(self, response):
    """Build a feed entry from an article page."""
    remove_elems = [
        'aside',
        'script',
        'h1',
        '.breadcrumbs',
        '.author-date',
        '.artikel-social-kommentar',
        '.bild-copyright',
        '.ressortTitleMobile',
        '.article-number',
        '.artikel-kommentarlink',
        '.umfrage-wrapper',
        '.articleIssueInfo',
    ]
    il = FeedEntryItemLoader(
        response=response,
        timezone=self._timezone,
        base_url='http://' + self.name,
        remove_elems=remove_elems,
    )
    il.add_value('link', response.url)
    # Use the generic editorial byline when no author is present.
    il.add_value(
        'author_name',
        response.css('.author-date ::text').re(r'(?:Von)?\s*(\w+ \w+)') or 'Red.',
    )
    il.add_css('title', 'h1[itemprop="headline"]::text')
    # Keep everything before the '+' of the published timestamp.
    il.add_css(
        'updated',
        'meta[property="article:published_time"]::attr(content)',
        re='([^+]*)',
    )
    il.add_css('content_html', 'article')
    yield il.load_item()
def parse_login_issue(self, response):
    """Build a feed entry for a login-protected issue page."""
    loader = FeedEntryItemLoader(
        response=response,
        base_url='https://www.{}'.format(self.name),
        remove_elems=[
            '.field-name-field-file-access',
            '.field-name-field-login-issue-file',
            '.field-name-field-product',
            '.field-commerce-price',
            '.views-field-field-file-access',
            '.view-header',
        ],
        dayfirst=True,
    )
    loader.add_value('link', response.url)
    title = response.css('h1::text').extract_first().strip()
    loader.add_value('title', title)
    # The issue date is derived from the title.
    loader.add_value('updated', self._date_from_title(title))
    loader.add_css('content_html', '.content-wrapper')
    loader.add_value('path', 'login')
    if response.css('.usenix-files-protected'):
        loader.add_value('category', 'paywalled')
    yield loader.load_item()
def parse_item(self, response):
    """Build a feed entry for a comic or blog post."""
    il = FeedEntryItemLoader(response=response, base_url=self._base_url)
    il.add_value("updated", response.meta["updated"])
    il.add_value("author_name", response.meta["author_name"])
    il.add_value("link", response.url)
    il.add_css("title", "title::text", re="(.*) - The Oatmeal")
    # The first URL path segment doubles as the category.
    category = urlsplit(response.url).path.strip("/").split("/")[0]
    il.add_value("category", category)
    # comics
    il.add_css("content_html", "#comic > img")
    il.add_css("content_html", "#comic > p > img")
    # blog
    il.add_css("content_html", "#blog .center_text img")
    return il.load_item()
def parse(self, response):
    """Parse a listing page: one feed entry per item, plus header and pagination."""
    thumbnails = response.css(".thumbnail")
    if len(thumbnails) == 0:
        self.logger.info("No items found.")
        return
    for item in thumbnails:
        il = FeedEntryItemLoader(selector=item, base_url=self._base_url)
        il.add_css("title", ".item_brand_text ::text")
        il.add_css("title", ".item-title ::text")
        il.add_css("title", ".current-price ::text")
        il.add_value(
            "link",
            response.urljoin(item.css(".item-link::attr(href)").extract_first()),
        )
        image_url = item.css(".item-image::attr(data-bg)").re_first(
            r"url\(([^)]+)\)"
        )
        # Fix: re_first() returns None when no background image matches;
        # previously this crashed on .startswith().
        if image_url:
            # Fix broken images.
            if image_url.startswith("https://markenankauf.momox.de/pics/https://"):
                image_url = image_url.replace(
                    "https://markenankauf.momox.de/pics/https://", "https://"
                )
            il.add_value("content_html", '<img src="{}">'.format(image_url))
        il.add_css("content_html", ".item-des-container")
        il.add_value("path", response.meta["path"])
        yield il.load_item()
    page = int(response.css(".pagination .active a::text").extract_first())
    # Emit the feed header exactly once, on the first page.
    if page == 1:
        yield generate_feed_header(
            title=response.css("title ::text").re_first(
                "(ubup | .*) Second Hand kaufen"
            ),
            subtitle="Deutschlands größter Second Hand-Onlineshop für "
            "Mode & Accessoires",
            icon="https://www.{}/images/favicon.ico".format(self.name),
            link=response.url,
            path=response.meta["path"],
        )
    if page < self._scrape_pages:
        next_page = response.css(
            ".pagination .active + li a::attr(href)"
        ).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                meta={"dont_cache": True, "path": response.meta["path"]},
            )
def _parse_restaurant(self, response):
    """Merge the restaurant detail page into the parent feed item."""
    loader = FeedEntryItemLoader(
        response=response,
        base_url=response.url,
        parent=response.meta["il"],
        remove_elems=[".external"],
    )
    # Description paragraphs first, then the restaurant link.
    loader.add_css("content_html", ".content .right p")
    loader.add_css("content_html", ".restaurant-link")
    # Tags become feed categories.
    loader.add_css("category", ".tags a ::text")
    yield loader.load_item()
def parse_item(self, response):
    """Parse a single article page into a feed entry, or skip it."""
    if response.status == 404:
        self.logger.info("Article '{}' not available anymore.".format(response.url))
        return

    def _clean_caption(elem):
        # Captions are of the format "text – credit": keep only the text.
        # A caption without "–" is just the credit and is dropped entirely.
        if "–" not in elem.text:
            return None
        elem.text = re.sub(r"\s*([^–]*).*", r"\1", elem.text)
        return elem

    # Skip the leading "/" of the section path.
    section = response.css(
        'meta[name="kt:section-path"]::attr("content")'
    ).extract_first()[1:]
    wanted_sections = self._sections
    if "all" not in wanted_sections and section not in wanted_sections:
        # This ressort was not requested; ignore the response.
        return
    loader = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=[
            ".ad",
            ".article-paid",
            ".js-overlay-close",
            ".swiper-lazy-preloader",
        ],
        change_tags={".article__lead": "strong"},
        change_attribs={".zoomable__image--zoomed": {"data-src": "src"}},
        replace_elems={".article__media-caption": _clean_caption},
        base_url="https://www.{}".format(self.name),
    )
    # Strip an optional "Von"/"von" prefix from the author line.
    loader.add_css(
        "author_name",
        "article .article__author ::text",
        re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
    )
    loader.add_css("content_html", "article .article__media .zoomable__inner")
    loader.add_css("content_html", "article .article__lead")  # change tags to strong
    loader.add_css("content_html", "article .article__body")
    if response.css(".article-paid"):
        loader.add_value("category", "paywalled")
    loader.add_value("category", section.split("/"))
    if "all" in wanted_sections:
        loader.add_value("path", "all")
    if section in wanted_sections:
        loader.add_value("path", section)
    return loader.load_item()
def _parse_user_profile(self, response):
    """Record the user's display name and emit one entry per posting."""
    user_name = response.css("#up_user h2::text").extract_first().strip()
    self._users[response.meta["user_id"]] = user_name
    for posting in response.css(".posting"):
        loader = FeedEntryItemLoader(
            selector=posting,
            base_url="https://{}".format(self.name),
            change_tags={"span": "p"},
        )
        loader.add_css("title", ".text strong::text")
        loader.add_css("link", '.text a::attr("href")')
        # data-timestamp is in milliseconds since the epoch.
        timestamp_ms = int(
            posting.css('.date::attr("data-timestamp")').extract_first()
        )
        loader.add_value("updated", datetime.utcfromtimestamp(timestamp_ms / 1000))
        loader.add_css("content_html", ".text span")
        loader.add_css("content_html", ".article h4")
        loader.add_value("path", response.meta["path"])
        yield loader.load_item()
def _parse_interview(self, response):
    """Extract an interview article, dropping share widgets and the tail."""
    # NOTE(review): the two trailing selectors are listed twice on purpose —
    # per the original comment this is meant to remove the last TWO h2
    # sections (after the first removal the next h2 becomes :last-of-type).
    # Presumably the removal engine applies the list in order; verify before
    # deduplicating.
    remove_elems = [
        ".shareable-quote",
        ".share-bar",
        # Remove the last two h2s and all paragraphs below.
        ".interview-body > h2:last-of-type ~ p",
        ".interview-body > h2:last-of-type",
        ".interview-body > h2:last-of-type ~ p",
        ".interview-body > h2:last-of-type",
    ]
    loader = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=remove_elems,
    )
    loader.add_value("link", response.url)
    loader.add_value("updated", response.meta["updated"])
    loader.add_css("title", "h1::text")
    loader.add_css("author_name", "header .user-link__name::text")
    loader.add_css("content_html", ".interview-body")
    return loader.load_item()
def _parse_article_url(self, response):
    """Parse an article overview page; follow the print version if present.

    Raises DropResponse for empty or error pages.  Returns either a
    scrapy.Request for the print version (full article text) or a
    paywalled item built from the teaser.
    """
    if not response.css("#content"):
        raise DropResponse(
            "Skipping {} since it is empty".format(response.url), transient=True
        )
    # extract_first() returns None when the page has no h2 at all; treat
    # that like a page without an error message instead of raising
    # TypeError on the "in" check.
    if "Fehler" in (response.css("h2 ::text").extract_first() or ""):
        raise DropResponse(
            "Skipping {} since it returned an error".format(response.url),
            transient=True,
        )
    remove_elems = ['div[style="padding-top:10px;"]']
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url="https://{}".format(self.name),
        dayfirst=True,
        remove_elems=remove_elems,
    )
    il.add_value("link", response.url)
    il.add_value("author_name", "VKI")
    # Publication date, e.g. "veröffentlicht: 01.02.2019".
    date = response.css(".issue").re_first(
        r"veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})"
    )
    il.add_value("updated", date)
    # The print version link is hidden in an onclick handler.
    url = response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
        r"window\.open\('(.*)'\);"
    )
    il.add_css("title", "h1::text")
    if url:
        # A print version exists: fetch it for the full article content.
        return scrapy.Request(
            response.urljoin(url), callback=self._parse_article, meta={"il": il}
        )
    else:
        # No print version: article is paywalled, keep only the teaser.
        il.add_value("category", "paywalled")
        il.add_css("content_html", ".primary")
        il.add_css("content_html", 'div[style="padding-top:10px;"] > h3')
        return il.load_item()
def _parse_article(self, response):
    """Parse an article into feed items, inlining its audio and video.

    This is a generator in inline-requests style: media API requests built
    by ``self._build_api_request`` are yielded, and their responses are
    sent back into the generator.  Emits the article item and, when the
    article contains audio, a second copy destined for the podcast feed.
    """

    def _inline_video(videos, elem):
        # Make an embedded <video> playable by inserting a <source> child
        # with the resolved MP4 URL; header videos (no data-video-id) are
        # replaced with their placeholder image instead.
        if "data-video-id" in elem.attrib:
            source = lxml.etree.Element("source")
            source.attrib["src"] = videos[elem.attrib["data-video-id"]]
            source.attrib["type"] = "video/mp4"
            elem.insert(0, source)
            return elem
        else:
            # Header video, replace with placeholder image.
            parent = elem.getparent()
            parent.tag = "figure"
            if "data-placeholderbig" in elem.attrib:
                src = elem.attrib["data-placeholderbig"]
            else:
                src = elem.attrib["data-placeholder"]
            image = lxml.etree.Element("img")
            image.attrib["src"] = src
            return image

    def _inline_picture(elem):
        # Collapse a responsive picture element into a plain <img>, keeping
        # the source of the child <span> with the largest data-min-width.
        elem.tag = "img"
        src = elem.attrib.get("data-original")
        data_min_width = 1000 if src else -1
        for child in elem.getchildren():
            if child.tag != "span":
                continue
            if int(child.attrib.get("data-min-width", 0)) > data_min_width:
                src = child.attrib["data-src"]
                data_min_width = int(child.attrib.get("data-min-width", 0))
            child.drop_tree()
        elem.attrib["src"] = src
        return elem

    # Collect the media ids of the audio player and all video players.
    audio_ids = response.css(
        '#BCaudioPlayer_eindeutig::attr("data-video-id")'
    ).extract()
    video_ids = response.css('.video-js::attr("data-video-id")').extract()
    media = {}
    for media_id in audio_ids + video_ids:
        # Yield the API request; the response is sent back in (inline
        # requests), then resolved to the URL of the largest MP4 rendition.
        api_response = yield self._build_api_request(media_id)
        api_response = json.loads(api_response.text)
        media[media_id] = sorted(
            (
                video
                for video in api_response["sources"]
                if "src" in video and video.get("container") == "MP4"
            ),
            key=lambda v: v["size"],
        )[-1]["src"]
    # Page furniture and social widgets that must not end up in the feed.
    remove_elems = [
        "h1",
        "script",
        "style",
        ".projectNav",
        ".socialShare",
        ".socialShare__headline",
        ".socialShare__icon",
        ".socialMedia",
        ".socialMedia__headline",
        ".whyRead",
        ".overlayCTA",
        ".authors",
        ".sectionBackground--colorTheme1",
        ".heroStage__copyright",
        ".heroStage__downLink",
        ".callToAction",
        ".print-action",
        ".internalLink span",
        ".addCommunity",
        ".download",
        ".BCaudioPlayer",
        ".icon-date",
        ".callToAction__button",
        'a[href^="http://partners.webmasterplan.com/click.asp"]',
        ".relatedSlider",
        ".imageLightbox",
        ".image__copyrightWrapper",
        ".image__zoom",
        ".image > .picture",
        ".imageHC",
    ]
    # Map site-specific markup onto semantic HTML elements.
    change_tags = {
        "div.heroStage__introText": "strong",
        ".quote": "blockquote",
        ".quote__label": "footer",
        ".supernumber": "blockquote",
        ".image": "figure",
        ".image__element": "div",
    }
    replace_elems = {
        "video": partial(_inline_video, media),
        ".picture": _inline_picture,
    }
    pullup_elems = {".image__content figcaption": 3}
    il = FeedEntryItemLoader(
        response=response,
        base_url=response.url,
        remove_elems=remove_elems,
        change_tags=change_tags,
        replace_elems=replace_elems,
        pullup_elems=pullup_elems,
    )
    il.add_value("link", response.url)
    il.add_css("author_name", ".sidebar .authors__name::text")
    il.add_css("title", "title::text", re="(.*) - Addendum")
    il.add_css("updated", 'meta[property="article:modified_time"]::attr(content)')
    # If not yet modified:
    il.add_css("updated", 'meta[property="article:published_time"]::attr(content)')
    il.add_css("content_html", ".content")
    # Videos become enclosures of the regular item; audio is reserved for
    # the podcast copy below.
    for medium_id, medium_url in media.items():
        if medium_id not in audio_ids:
            il.add_value("enclosure", {"iri": medium_url, "type": "video/mp4"})
    item = il.load_item()
    # Save a copy before yielding it.
    item_podcast = deepcopy(item)
    yield item
    if audio_ids:
        # Export to podcast feed.
        il = FeedEntryItemLoader(item=item_podcast)
        il.add_value("path", "podcast")
        for medium_id, medium_url in media.items():
            if medium_id in audio_ids:
                il.add_value("enclosure", {"iri": medium_url, "type": "audio/mp4"})
        yield il.load_item()
def _parse_article(self, response):
    """Parse an ORF.at article into a feed item.

    Generator in inline-requests style: teaser redirects and slideshow
    JSON documents are fetched by yielding requests whose responses are
    sent back into the generator.
    """
    # Heuristic for news.ORF.at to detect teaser articles.
    more = self._extract_link(
        response.css(
            ".story-story p > strong:contains('Mehr') + a::attr(href), "
            + ".story-story p > a:contains('Lesen Sie mehr')::attr(href)"
        ).extract_first()
    )
    if more and more != response.url:
        self.logger.debug("Detected teaser article, redirecting to {}".format(more))
        # Fetch the full article and continue with its response instead.
        response = yield scrapy.Request(more, meta=response.meta)
    # Page furniture and social widgets that must not end up in the feed.
    remove_elems = [
        ".byline",
        "h1",
        ".socialshare",
        ".socialShareWrapper",
        ".socialButtons",
        ".credit",
        ".toplink",
        ".offscreen",
        ".storyMeta",
        "script",
        ".oon-youtube-logo",
        ".vote",
        # redesign
        "#more-to-read-anchor",
        ".social-buttons",
        ".story-horizontal-ad",
        ".linkcard",
    ]
    # Lift embedded remote widgets out of their wrapper elements.
    pullup_elems = {
        ".remote .slideshow": 1,
        ".remote .instagram": 1,
        ".remote .facebook": 1,
        ".remote .twitter": 1,
        ".remote .youtube": 1,
        ".remote table": 1,
    }
    replace_elems = {
        ".video": "<p><em>Hinweis: Das eingebettete Video ist nur im Artikel "
        + "verfügbar.</em></p>"
    }
    change_attribs = {"img": {"data-src": "src", "srcset": "src"}}
    change_tags = {
        ".image": "figure",
        ".caption": "figcaption",
        ".fact": "blockquote",  # FM4
    }
    author, author_selector = self._extract_author(response)
    if author:
        self.logger.debug("Extracted possible author '{}'".format(author))
        # Remove the paragraph that contains the author.
        remove_elems.insert(0, author_selector)
    else:
        self.logger.debug("Could not extract author name")
        # Fall back to the site section as the author name.
        author = "{}.ORF.at".format(response.meta["path"])
    # Inline each slideshow: fetch its JSON and replace the placeholder
    # element with the generated HTML.
    for slideshow in response.css(".slideshow"):
        link = response.urljoin(
            slideshow.css('::attr("data-slideshow-json-href")').extract_first()
        ).replace("jsonp", "json")
        slideshow_id = slideshow.css('::attr("id")').extract_first()
        slideshow_response = yield scrapy.Request(link)
        replace_elems["#{}".format(slideshow_id)] = self._create_slideshow_html(
            slideshow_response
        )
    il = FeedEntryItemLoader(
        response=response,
        remove_elems=remove_elems,
        pullup_elems=pullup_elems,
        replace_elems=replace_elems,
        change_attribs=change_attribs,
        change_tags=change_tags,
    )
    # The field is part of a JSON that is sometimes not valid, so don't bother with
    # parsing it properly.
    match = re.search(r'"datePublished": "([^"]+)"', response.text)
    if match:
        # news.ORF.at
        updated = match.group(1)
    else:
        # other
        updated = response.meta["updated"]
    il.add_value("updated", updated)
    il.add_css("title", "title::text", re=re.compile(r"(.*) - .*", flags=re.S))
    il.add_value("link", response.url)
    il.add_css("content_html", ".opener img")  # FM4, news
    il.add_css("content_html", ".story-lead-text")  # news
    il.add_css("content_html", "#ss-storyText")
    il.add_css("content_html", "#ss-storyContent")  # news
    il.add_value("author_name", author)
    if author in self._authors:
        il.add_value("path", author)
    il.add_value("path", response.meta["path"])
    il.add_value("category", response.meta["categories"])
    yield il.load_item()
def _parse_article(self, response):
    """Parse a derstandard.at article: regular, feature, or user blog."""

    def _fix_img_src(elem):
        # data-zoom-src is only valid if it starts with //images.derstandard.at.
        zoom_src = elem.attrib.pop("data-zoom-src", None)
        if zoom_src and zoom_src.startswith("//images.derstandard.at"):
            elem.attrib["src"] = zoom_src
        for attrib in ("width", "height", "class"):
            elem.attrib.pop(attrib, None)
        return elem

    loader = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=[
            ".credits",
            ".owner-info",
            ".image-zoom",
            ".continue",
            ".sequence-number",
            ".js-embed-output",
            "#mycountrytalks-embed",
            # Remove self-promotion for (other) ressorts.
            '.js-embed-output-feeds a[href^="/r"]',
            '.js-embed-output-feeds a[href^="https://derstandard.at/"]',
            '.js-embed-output-feeds img[src="https://images.derstandard.at/2018/10/18/Immobiliensuche202x122.png"]',
        ],
        change_tags={
            "#media-list li .description": "figcaption",
            "#media-list li": "figure",
            "#media-list": "div",
            ".photo": "figure",
            ".caption": "figcaption",
        },
        replace_elems={
            ".embedded-posting": "<p><em>Hinweis: Das eingebettete Posting ist nur im Artikel verfügbar.</em></p>",
            # Replace every special script container with its unescaped content.
            "script.js-embed-template": lambda elem: (
                '<div class="js-embed-output-feeds">'
                + html.unescape(elem.text or "")
                + "</div>"
            ),
            "img": _fix_img_src,
        },
    )
    loader.add_value("link", response.url)
    loader.add_css("title", 'meta[property="og:title"]::attr(content)')
    for author in response.css("span.author::text").extract():
        # Sometimes the author name is messed up and written in upper case.
        # This happens usually for articles written by Günter Traxler.
        loader.add_value(
            "author_name", author.title() if author.upper() == author else author
        )
    loader.add_value("path", response.meta["ressort"])
    loader.add_value("updated", response.meta["updated"])
    loader.add_css("category", "#breadcrumb .item a::text")
    blog_id = response.css("#userblogentry::attr(data-objectid)").extract_first()
    if blog_id:
        # User blog entries are loaded from a separate delivery endpoint.
        url = (
            "https://{}/userprofil/bloggingdelivery/blogeintrag?godotid={}"
        ).format(self.name, blog_id)
        return scrapy.Request(url, self._parse_blog_article, meta={"il": loader})
    if response.css("#feature-content"):
        # Feature article: cover photo, cover title, then the copy text.
        cover_photo = response.css("#feature-cover-photo::attr(style)").re_first(
            r"\((.*)\)"
        )
        loader.add_value("content_html", '<img src="{}">'.format(cover_photo))
        loader.add_css("content_html", "#feature-cover-title h2")
        loader.add_css("content_html", "#feature-content > .copytext")
        return loader.load_item()
    # Regular article.
    loader.add_css("content_html", "#content-aside")
    loader.add_css("content_html", "#objectContent > .copytext")
    loader.add_css("content_html", "#content-main > .copytext")
    loader.add_css("content_html", ".slide")
    return loader.load_item()