def _parse_article(self, response): remove_elems = ["#issue", "h1", "#slogan", "#logo", "#footer"] il = FeedEntryItemLoader( response=response, parent=response.meta["il"], base_url="https://{}".format(self.name), remove_elems=remove_elems, ) il.add_css("content_html", "#page") return il.load_item()
def _parse_news(self, response):
    il = FeedEntryItemLoader(response=response, parent=response.meta["il"])
    il.add_xpath(
        "content_html",
        '//div[@class="newsheader" and .//a[@id="{}"]]'
        '/following-sibling::div[@class="newsinner"]'.format(
            response.meta["news_id"]
        ),
    )
    return il.load_item()
def parse(self, response):
    m = re.search("window.DELINSKI, {listViewEntities: (.*)}", response.text)
    restaurants = sorted(
        json.loads(m.group(1))["restaurants"]["entities"].values(),
        key=lambda r: int(r["created"]),
        reverse=True,
    )
    for restaurant in restaurants[:20]:
        il = FeedEntryItemLoader(timezone="UTC", base_url=response.url)
        url = response.urljoin(restaurant["url"])
        il.add_value("link", url)
        il.add_value("title", restaurant["name"])
        content = """
        <img src="{image}">
        <ul>
            <li>{address}</li>
            <li>{price_range_human}</li>
            <li>{cuisine_text}</li>
        </ul>
        """
        il.add_value("content_html", content.format(**restaurant))
        il.add_value(
            "updated", datetime.utcfromtimestamp(int(restaurant["created"]))
        )
        yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})
def parse_item(self, response): remove_elems = ["h1", ".delayed-image-load"] change_tags = {"noscript": "div"} il = FeedEntryItemLoader( response=response, parent=response.meta["il"], remove_elems=remove_elems, change_tags=change_tags, base_url="https://www.{}".format(self.name), ) il.add_xpath("content_html", '//div[@id="main-inner"]') return il.load_item()
def parse_node(self, response, node):
    url = node.xpath("rss:loc/text()").extract_first()
    il = FeedEntryItemLoader(selector=node)
    il.add_value("link", url)
    il.add_xpath("title", "news:news/news:title/text()")
    keywords = node.xpath("news:news/news:keywords/text()").extract_first()
    if keywords:
        il.add_value("category", keywords.split(", "))
    il.add_xpath("updated", "news:news/news:publication_date/text()")
    return scrapy.Request(
        url, self.parse_item, meta={"il": il, "handle_httpstatus_list": [404]}
    )
def _parse_article(self, response): remove_elems = ["script"] convert_footnotes = [".footnoteContent"] pullup_elems = {".footnoteContent": 1} change_tags = {".entry-content-info-box": "blockquote"} il = FeedEntryItemLoader( response=response, parent=response.meta["il"], remove_elems=remove_elems, change_tags=change_tags, base_url="https://{}".format(self.name), convert_footnotes=convert_footnotes, pullup_elems=pullup_elems, ) il.add_css("content_html", ".entry-content") return il.load_item()
def _parse_restaurant(self, response):
    il = FeedEntryItemLoader(
        response=response,
        base_url=response.url,
        parent=response.meta["il"],
        remove_elems=[".external"],
    )
    il.add_css("content_html", ".content .right p")
    il.add_css("content_html", ".restaurant-link")
    il.add_css("category", ".tags a ::text")
    yield il.load_item()
def parse_item(self, response): il = FeedEntryItemLoader( response=response, base_url="{}/".format(self.feed_link), timezone="Europe/Vienna", dayfirst=True, remove_elems=[".ruler", "h1"], ) il.add_css("title", "h1.event-title::text") il.add_value("link", response.url) il.add_css("content_html", "div#content.container") return il.load_item()
def parse(self, response):
    mitteilungsblaetter = response.css(".mitteilungsblaetter")
    updated = mitteilungsblaetter.css("::text").re_first(r"(\d{2}\.\d{2}\.\d{4})")
    link = response.urljoin(
        mitteilungsblaetter.css('a::attr("href")').extract_first()
    )

    response = yield scrapy.Request(link, method="HEAD")
    mb_url = response.url
    match = re.search(
        r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)", mb_url
    )
    if not match:
        self.logger.error("No Mitteilungsblätter found!")
        return
    else:
        mb_id = match.group(1)

    url = "https://tiss.{}/api/mbl/v22/id/{}".format(self.name, mb_id)
    response = yield scrapy.Request(url)
    last_entry = None
    for entry in reversed(json.loads(response.text)["knoten"]):
        (entry["main"], entry["sub"]) = re.match(
            r"(\d+)\.?(\d*)", entry["counter"]
        ).groups()
        if last_entry is not None and last_entry["main"] == entry["main"]:
            # Merge sub-entries (counter "12.1") into their main entry ("12").
            entry["inhalt"] += "<h2>{}</h2>".format(last_entry["titel"])
            entry["inhalt"] += last_entry["inhalt"]
        if entry["sub"] == "":
            il = FeedEntryItemLoader(
                base_url="https://tiss.{}".format(self.name),
                timezone="Europe/Vienna",
                dayfirst=True,
            )
            il.add_value("updated", updated)
            il.add_value("link", mb_url + "#{}".format(entry["counter"]))
            il.add_value("title", entry["titel"])
            il.add_value("content_html", entry["inhalt"])
            yield il.load_item()
            last_entry = None
        else:
            last_entry = entry
def parse_program(self, response):
    if not response.css(r".jsb_video\/FlashPlayer"):
        return
    data = json.loads(
        response.css(r".jsb_video\/FlashPlayer").xpath("@data-jsb").extract()[0]
    )
    data = data["config"]["initial_video"]["parts"][0]["tracking"]["nurago"]
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        timezone="Europe/Vienna",
        dayfirst=True,
    )
    il.add_value("link", data["clipurl"])
    il.add_value("title", data["programname"])
    il.add_value("updated", data["airdate"])
    il.add_xpath("content_html", '//p[@class="plot_summary"]')
    item = il.load_item()
    # Only include videos posted in the last 7 days.
    if item["updated"] + self._timerange > datetime.now(timezone.utc):
        return item
def parse_release_changelog(self, response):
    il = FeedEntryItemLoader(
        response=response, parent=response.meta["il"], base_url=self._base_url
    )
    il.add_value("content_html", "<h1>Detailed Changelog</h1>")
    il.add_xpath("content_html", "//h1/following-sibling::*")
    return il.load_item()
def parse_node(self, response, node):
    link = node.xpath("link/text()").extract_first()
    il = FeedEntryItemLoader()
    il.add_value("title", node.xpath("title/text()").extract_first())
    il.add_value("updated", node.xpath("pubDate/text()").extract_first())
    il.add_value("category", node.xpath("category/text()").extract())
    return scrapy.Request(
        link,
        self._parse_article,
        cookies={"view": "mobile"},
        meta={"il": il, "path": response.meta["path"], "first_page": True},
    )
def parse(self, response):
    # Wiener Linien returns HTML with an XML content type which creates an
    # XmlResponse.
    response = HtmlResponse(url=response.url, body=response.body)
    for item in response.css(".block-news-item"):
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            ignoretz=True,
            base_url="https://www.{}".format(self.name),
        )
        link = response.urljoin(item.css("a::attr(href)").extract_first())
        il.add_value("link", link)
        il.add_value("title", item.css("h3::text").extract_first())
        il.add_value("updated", item.css(".date::text").extract_first())
        yield scrapy.Request(link, self.parse_item, meta={"il": il})
def parse_item_text(self, response):
    remove_elems = [".dachzeile", "h1", ".meta", "br", "form", ".button-container"]
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=remove_elems,
        base_url="https://{}".format(self.name),
    )
    content = response.xpath("//article").extract_first()
    if "Lesen Sie diesen Artikel in voller Länge" in content:
        il.add_value("category", "paywalled")
    il.add_value("content_html", content)
    return il.load_item()
def _parse_article(self, response):
    feed_entry = response.meta["feed_entry"]
    il = FeedEntryItemLoader(parent=response.meta["il"])
    doc = Document(response.text, url=response.url)
    il.add_value("title", doc.short_title() or feed_entry.get("title"))
    summary = feed_entry.get("summary")
    try:
        content = doc.summary(html_partial=True)
        if summary and len(summary) > len(content):
            # Something probably went wrong if the extracted content is
            # shorter than the summary.
            raise Unparseable
    except Unparseable:
        content = summary
    il.add_value("content_html", content)
    return il.load_item()
def parse_lokalfuehrer(self, response):
    entries = json.loads(response.text)[0]["hits"]
    for entry in entries:
        il = FeedEntryItemLoader(
            response=response, base_url="https://{}".format(self.name)
        )
        il.add_value(
            "path", "lokalfuehrer_{}".format(response.meta["lokalfuehrer"])
        )
        il.add_value(
            "link", "https://www.{}/lokal/{}".format(self.name, entry["id"])
        )
        il.add_value("category", entry["categories"])
        il.add_value("category", entry["zip"])
        il.add_value("category", entry["city"])
        review = entry.get("review")
        if review:
            il.add_value("title", review["post_title"])
            il.add_value("title", review["post_subtitle"])
            il.add_value("author_name", review["meta"].split("|")[0].title())
            il.add_value("category", "review")
            il.add_value("updated", review["post_date"])
        else:
            il.add_value("title", entry["name"])
        if "pictures" in entry and entry["pictures"]:
            il.add_value(
                "content_html",
                '<img src="https://fcc.at/ef/img720/{}">'.format(
                    entry["pictures"][0]["filename"]
                ),
            )
        if review:
            il.add_value("content_html", review["post_content"])
        il.add_value("content_html", entry["category_text"])
        il.add_value(
            "content_html",
            "<p>{} {}, {}</p>".format(entry["zip"], entry["city"], entry["street"]),
        )
        if entry["location"]:
            il.add_value(
                "content_html",
                (
                    '<p><a href="https://www.google.com/maps?q={lat},{lon}">'
                    + "Google Maps</a></p>"
                ).format(**entry["location"]),
            )
        yield il.load_item()
def parse_archive_search(self, response):
    for i, item in enumerate(json.loads(response.text)["result"]["hits"]):
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            timezone="Europe/Vienna",
        )
        il.add_value("path", "magazine")
        link = response.urljoin(item["detail_link"])
        il.add_value("link", link)
        try:
            author = re.sub(
                r"(?:.*:|Von)\s*(.*)", r"\1", ", ".join(item["authors"]).title()
            )
            il.add_value("author_name", author)
        except IndexError:
            pass
        il.add_value("title", item["title"])
        # All articles have the same date.
        # We add an offset so they are sorted in the right order.
        date = response.meta["issue_date"] + timedelta(seconds=i)
        il.add_value("updated", date)
        yield scrapy.Request(link, self.parse_item_text, meta={"il": il})
def parse_content(self, response):
    parts = self._extract_parts(response)
    il = FeedEntryItemLoader(
        response=response, timezone="Europe/Vienna", dayfirst=True
    )
    il.add_value("path", self._library)
    il.add_value("title", " - ".join(parts[: self._find_first_meta(parts)]))
    il.add_value("link", response.url)
    il.add_xpath("updated", "//td/span/text()", re="In der Bibliothek seit: (.*)")
    _content = ["<ul>"]
    for part in parts:
        _content.append("<li>{}</li>".format(part))
    _content.append("</ul>")
    il.add_value("content_html", "".join(_content))
    return il.load_item()
def _parse_article(self, response): remove_elems = ["h1", "#contents", ".headerlink"] change_tags = {".admonition-title": "h2"} il = FeedEntryItemLoader( response=response, base_url=response.url, remove_elems=remove_elems, change_tags=change_tags, ) il.add_value("link", response.url) il.add_value("author_name", "Brandon Rhodes") # Use "Last-Modified" field or fall back to "Date". updated = ( response.headers.get("Last-Modified", response.headers.get("Date")) ).decode("ascii") il.add_value("updated", updated) il.add_css("title", "title::text") il.add_css("content_html", ".section") return il.load_item()
def parse_archive_search(self, response):
    articles = json.loads(response.text)["articles"]["hits"]
    for i, item in enumerate(articles):
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            timezone="Europe/Vienna",
        )
        il.add_value("path", "magazine")
        link = response.urljoin(item["detail_link"])
        il.add_value("link", link)
        try:
            author = re.sub(
                r"(?:.*:|Von)\s*(.*)", r"\1", ", ".join(item["authors"]).title()
            )
            il.add_value("author_name", author)
        except IndexError:
            pass
        il.add_value("title", item["title"])
        # All articles have the same date.
        # We add an offset so they are sorted in the right order.
        date = response.meta["issue_date"] + timedelta(seconds=i)
        il.add_value("updated", date)
        il.add_value("category", item["ressort"])
        yield scrapy.Request(link, self.parse_item_text, meta={"il": il})
def parse_broadcast(self, response):
    broadcast = json.loads(response.text)
    il = FeedEntryItemLoader(
        response=response, timezone=self._timezone, dayfirst=False
    )
    link = "https://{}/programm/{}/{}".format(
        self.name, response.meta["oe1_day"], broadcast["programKey"]
    )
    il.add_value("link", link)
    il.add_value("title", broadcast["programTitle"])
    il.add_value("title", broadcast["title"])
    if broadcast.get("streams"):
        stream = "http://loopstream01.apa.at/?channel=oe1&id={}".format(
            broadcast["streams"][0]["loopStreamId"]
        )
        il.add_value("enclosure_iri", stream)
        il.add_value("enclosure_type", "audio/mpeg")
    il.add_value("updated", broadcast["niceTimeISO"])
    if broadcast["subtitle"]:
        il.add_value(
            "content_html", "<strong>{}</strong>".format(broadcast["subtitle"])
        )
    for item in broadcast["items"]:
        if "title" in item:
            il.add_value("content_html", "<h3>{}</h3>".format(item["title"]))
        il.add_value("content_html", item.get("description"))
    il.add_value("content_html", broadcast["description"])
    yield il.load_item()
def parse_item(self, response):
    il = FeedEntryItemLoader(
        selector=response.xpath('//div[@id="maincontentbook"]'),
        base_url=self.feed_link,
    )
    il.add_xpath("title", '//h1[@class="p_book_title"]/text()')
    il.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()')
    il.add_value("link", response.url)
    il.add_value("author_name", self.feed_title)
    il.add_xpath("content_html", '//h1[@class="p_book_title"]/text()')
    il.add_xpath("content_html", '//h2[@class="p_book_author"]/text()')
    il.add_xpath("content_html", '//p[@class="p_book_publisher"]/text()')
    il.add_xpath("content_html", '//p[@class="p_book_isbn"]/text()')
    il.add_xpath("content_html", '(//span[@class="txt10px"])[1]/text()')
    il.add_xpath("content_html", '(//span[@class="txt10px"])[3]/text()')
    il.add_xpath("content_html", '//div[@class="bookcontent"]//text()')
    il.add_xpath("content_html", '//div[@class="p_book_image"]/img')
    il.add_xpath("content_html", '//span[@style="color:red;"]/b/text()')
    return il.load_item()
def _parse_article(self, response):
    def _fix_img_src(elem):
        src = elem.attrib.pop("data-zoom-src", None)
        # data-zoom-src is only valid if it starts with //images.derstandard.at.
        if src and src.startswith("//images.derstandard.at"):
            elem.attrib["src"] = src
        elem.attrib.pop("width", None)
        elem.attrib.pop("height", None)
        elem.attrib.pop("class", None)
        return elem

    remove_elems = [
        ".credits",
        ".owner-info",
        ".image-zoom",
        ".continue",
        ".sequence-number",
        ".js-embed-output",
        "#mycountrytalks-embed",
        # Remove self-promotion for (other) ressorts.
        '.js-embed-output-feeds a[href^="/r"]',
        '.js-embed-output-feeds a[href^="https://derstandard.at/"]',
        (
            ".js-embed-output-feeds "
            + 'img[src="https://images.derstandard.at/2018/10/18/'
            + 'Immobiliensuche202x122.png"]'
        ),
    ]
    change_tags = {
        "#media-list li .description": "figcaption",
        "#media-list li": "figure",
        "#media-list": "div",
        ".photo": "figure",
        ".caption": "figcaption",
    }
    replace_elems = {
        ".embedded-posting": "<p><em>Hinweis: Das eingebettete Posting ist nur "
        + "im Artikel verfügbar.</em></p>",
        # Replace every special script container with its unescaped content.
        "script.js-embed-template": lambda elem: (
            '<div class="js-embed-output-feeds">'
            + html.unescape(elem.text or "")
            + "</div>"
        ),
        "img": _fix_img_src,
    }
    il = FeedEntryItemLoader(
        response=response,
        base_url="https://{}".format(self.name),
        remove_elems=remove_elems,
        change_tags=change_tags,
        replace_elems=replace_elems,
    )
    il.add_value("link", response.url)
    il.add_css("title", 'meta[property="og:title"]::attr(content)')
    for author in response.css("span.author::text").extract():
        # Sometimes the author name is messed up and written in upper case.
        # This happens usually for articles written by Günter Traxler.
        if author.upper() == author:
            author = author.title()
        il.add_value("author_name", author)
    il.add_value("path", response.meta["ressort"])
    il.add_value("updated", response.meta["updated"])
    il.add_css("category", "#breadcrumb .item a::text")
    blog_id = response.css("#userblogentry::attr(data-objectid)").extract_first()
    if blog_id:
        url = (
            "https://{}/userprofil/bloggingdelivery/blogeintrag?godotid={}"
        ).format(self.name, blog_id)
        return scrapy.Request(url, self._parse_blog_article, meta={"il": il})
    elif response.css("#feature-content"):
        cover_photo = response.css("#feature-cover-photo::attr(style)").re_first(
            r"\((.*)\)"
        )
        il.add_value("content_html", '<img src="{}">'.format(cover_photo))
        il.add_css("content_html", "#feature-cover-title h2")
        il.add_css("content_html", "#feature-content > .copytext")
        return il.load_item()
    else:
        il.add_css("content_html", "#content-aside")
        il.add_css("content_html", "#objectContent > .copytext")
        il.add_css("content_html", "#content-main > .copytext")
        il.add_css("content_html", ".slide")
        return il.load_item()
def parse_blog_article(self, response):
    remove_elems = [".ad-component", ".wp-caption-text"]
    il = FeedEntryItemLoader(
        response=response,
        remove_elems=remove_elems,
        base_url="https://cms.{}".format(self.name),
        timezone="Europe/Vienna",
        dayfirst=True,
        yearfirst=False,
    )
    il.add_css("content_html", "article > h2")
    il.add_css("content_html", ".storycontent-article")
    il.add_css("author_name", ".falter-heading ::text", MapCompose(str.title))
    il.add_css(
        "author_name", ".thinktank-meta > span ::text", MapCompose(str.title)
    )
    il.add_css("updated", ".post > .text-label ::text", re=r"(\d{2}\.\d{2}\.\d{4})")
    il.add_value("link", response.url)
    il.add_value("path", "blog_{}".format(response.meta["blog"]))
    il.add_css("title", "article > h1 ::text")
    return il.load_item()
def _parse_episode(self, response):
    item = json.loads(response.text)
    il = FeedEntryItemLoader()
    il.add_value("title", item["title"])
    il.add_value(
        "content_html",
        '<img src="{}">'.format(item["playlist"]["preview_image_url"]),
    )
    if item["description"]:
        il.add_value("content_html", item["description"].replace("\r\n", "<br>"))
    il.add_value("updated", item["date"])
    il.add_value("link", item["url"].replace("api-tvthek.orf.at", "tvthek.orf.at"))
    # Check how many segments are part of this episode.
    if len(item["_embedded"]["segments"]) == 1:
        # If only one segment, item["sources"] contains invalid links.
        # We use the first embedded segment instead.
        # This is also how mediathekviewweb.de works.
        item["sources"] = item["_embedded"]["segments"][0]["sources"]
    try:
        video = next(
            s
            for s in item["sources"]["progressive_download"]
            if s["quality_key"] == "Q8C"
        )
        il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
    except StopIteration:
        self.logger.warning(
            "Could not extract video for '{}'!".format(item["title"])
        )
        raise DropResponse(
            "Skipping {} because not downloadable yet".format(response.url),
            transient=True,
        )
    subtitle = item["_embedded"].get("subtitle")
    if subtitle:
        subtitle = subtitle["_embedded"]["srt_file"]["public_urls"]["reference"]
        il.add_value("enclosure", {"iri": subtitle["url"], "type": "text/plain"})
    else:
        self.logger.debug("No subtitle file found for '{}'".format(item["url"]))
    il.add_value(
        "category",
        self._categories_from_oewa_base_path(
            item["_embedded"]["profile"]["oewa_base_path"]
        ),
    )
    return il.load_item()
def parse(self, response):
    feed = feedparser.parse(io.BytesIO(response.body))
    if "entries" not in feed or not feed["entries"]:
        self.logger.error("Feed {} contains no entries!".format(response.url))
        return
    feed_entries = feed["entries"]
    feed = feed["feed"]
    yield generate_feed_header(
        title=feed.get("title"),
        subtitle=feed.get("subtitle"),
        link=feed.get("link") or response.url,
        path=response.meta["path"],
        author_name=feed.get("author_detail", {}).get("name"),
        logo=feed.get("image", {}).get("href"),
    )
    base_url = "://".join(urlparse(response.url)[:2])
    for entry in feed_entries:
        # Deals with protocol-relative URLs.
        link = urljoin(base_url, entry["link"])
        il = FeedEntryItemLoader(base_url=base_url)
        il.add_value("path", response.meta["path"])
        il.add_value("updated", entry.get("updated") or entry.get("published"))
        il.add_value("author_name", entry.get("author_detail", {}).get("name"))
        il.add_value("link", link)
        il.add_value("category", [t["term"] for t in entry.get("tags", [])])
        if response.meta["fulltext"]:
            il.add_value("title", entry["title"])
            il.add_value("content_html", entry["content"][0]["value"])
            yield il.load_item()
        else:
            # Content is not part of the feed, scrape it.
            yield scrapy.Request(
                link, self._parse_article, meta={"feed_entry": entry, "il": il}
            )
def parse(self, response):
    articles = json.loads(response.text)
    remove_elems = [
        "hr + p",
        "hr",
        "iframe",
        "p i:last-of-type:contains('Facebook'):contains('Twitter')",
    ]
    for article in articles:
        il = FeedEntryItemLoader(timezone="UTC", remove_elems=remove_elems)
        il.add_value("title", article["title"])
        il.add_value("link", article["url"])
        if "thumbnail_url_1_1" in article:
            il.add_value(
                "content_html",
                '<img src="{}">'.format(article["thumbnail_url_1_1"]),
            )
        il.add_value("content_html", article["body"])
        il.add_value(
            "updated", datetime.utcfromtimestamp(article["publish_date"] / 1000)
        )
        il.add_value(
            "author_name",
            [
                contribution["contributor"]["full_name"]
                for contribution in article["contributions"]
            ],
        )
        il.add_value("category", article["channel"]["name"])
        for topic in article["topics"] + [article["primary_topic"]]:
            if topic and "name" in topic:
                il.add_value("category", topic["name"].title())
        if article["nsfw"]:
            il.add_value("category", "nsfw")
        if article["nsfb"]:
            il.add_value("category", "nsfb")
        il.add_value("path", response.meta["locale"])
        yield il.load_item()
def parse_item(self, response): il = FeedEntryItemLoader( response=response, base_url="{}/cms/".format(self.feed_link), timezone="Europe/Vienna", remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"], remove_elems_xpath=[ '//div[@class="news-single-item"]/b[1]', '//div[@class="news-single-item"]/br[1]', ], dayfirst=True, ) il.add_value( "title", response.xpath("//head/title/text()").re_first(r"::: (.*)") ) il.add_value("link", response.url) il.add_value( "updated", response.xpath('//div[@class="news-single-rightbox"]').re_first( r"(\d{2}\.\d{2}\.\d{4})" ), ) il.add_value( "author_name", response.xpath('//head/meta[@name="publisher"]/@content').re_first( "recht.at, (.*);" ), ) il.add_xpath("author_name", '//head/meta[@name="author"]/@content') il.add_value("author_name", self.name) il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content') il.add_css("content_html", ".news-single-item h7 font strong") il.add_css("content_html", ".news-single-item") return il.load_item()
def _parse_user_profile(self, response):
    self._users[response.meta["user_id"]] = (
        response.css("#up_user h2::text").extract_first().strip()
    )
    for posting in response.css(".posting"):
        il = FeedEntryItemLoader(
            selector=posting,
            base_url="https://{}".format(self.name),
            change_tags={"span": "p"},
        )
        il.add_css("title", ".text strong::text")
        il.add_css("link", '.text a::attr("href")')
        il.add_value(
            "updated",
            datetime.utcfromtimestamp(
                int(posting.css('.date::attr("data-timestamp")').extract_first())
                / 1000
            ),
        )
        il.add_css("content_html", ".text span")
        il.add_css("content_html", ".article h4")
        il.add_value("path", response.meta["path"])
        yield il.load_item()
def _parse_item(self, response):
    remove_elems = [
        "h1",
        ".nono",
        ".acceptance_org",
        ".state",
        "script",
        ".gentics-portletreload-position-notvisibleposition",
    ]
    remove_elems_xpath = [
        """
        //div[
            @class='advice' and
            child::div[@class='advice_text' and (
                contains(., 'nicht die aktuelle Rechtslage') or
                contains(., 'wird nicht laufend aktualisiert') or
                contains(., 'Übersicht über bisherige "Themen des Monats"')
            )]
        ]
        """,
        # Remove table of contents.
        "//li[child::a[starts-with(@href, '#')]]",
        "//ul[not(li)]",
    ]
    change_tags = {"abbr": "span"}
    il = FeedEntryItemLoader(
        response=response,
        timezone="Europe/Vienna",
        base_url="https://www.{}".format(self.name),
        remove_elems=remove_elems,
        remove_elems_xpath=remove_elems_xpath,
        change_tags=change_tags,
        dayfirst=True,
    )
    il.add_value("link", response.url)
    il.add_xpath(
        "author_name",
        '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
    )
    il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
    il.add_value(
        "updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")
    )
    il.add_css("content_html", ".Content")
    return il.load_item()
def _parse_article(self, response):
    # Heuristic for news.ORF.at to detect teaser articles.
    more = self._extract_link(
        response.css(
            ".story-story p > strong:contains('Mehr') + a::attr(href), "
            + ".story-story p > a:contains('Lesen Sie mehr')::attr(href)"
        ).extract_first()
    )
    if more and more != response.url:
        self.logger.debug("Detected teaser article, redirecting to {}".format(more))
        response = yield scrapy.Request(more, meta=response.meta)

    remove_elems = [
        ".byline",
        "h1",
        ".socialshare",
        ".socialShareWrapper",
        ".socialButtons",
        ".credit",
        ".toplink",
        ".offscreen",
        ".storyMeta",
        "script",
        ".oon-youtube-logo",
        ".vote",
        # redesign
        "#more-to-read-anchor",
        ".social-buttons",
        ".story-horizontal-ad",
        ".linkcard",
    ]
    pullup_elems = {
        ".remote .slideshow": 1,
        ".remote .instagram": 1,
        ".remote .facebook": 1,
        ".remote .twitter": 1,
        ".remote .youtube": 1,
        ".remote table": 1,
    }
    replace_elems = {
        ".video": "<p><em>Hinweis: Das eingebettete Video ist nur im Artikel "
        + "verfügbar.</em></p>"
    }
    change_attribs = {"img": {"data-src": "src", "srcset": "src"}}
    change_tags = {
        ".image": "figure",
        ".caption": "figcaption",
        ".fact": "blockquote",  # FM4
    }
    author, author_selector = self._extract_author(response)
    if author:
        self.logger.debug("Extracted possible author '{}'".format(author))
        # Remove the paragraph that contains the author.
        remove_elems.insert(0, author_selector)
    else:
        self.logger.debug("Could not extract author name")
        author = "{}.ORF.at".format(response.meta["path"])

    for slideshow in response.css(".slideshow"):
        link = response.urljoin(
            slideshow.css('::attr("data-slideshow-json-href")').extract_first()
        ).replace("jsonp", "json")
        slideshow_id = slideshow.css('::attr("id")').extract_first()
        slideshow_response = yield scrapy.Request(link)
        replace_elems["#{}".format(slideshow_id)] = self._create_slideshow_html(
            slideshow_response
        )

    il = FeedEntryItemLoader(
        response=response,
        remove_elems=remove_elems,
        pullup_elems=pullup_elems,
        replace_elems=replace_elems,
        change_attribs=change_attribs,
        change_tags=change_tags,
    )
    # The field is part of a JSON that is sometimes not valid, so don't bother
    # with parsing it properly.
    match = re.search(r'"datePublished": "([^"]+)"', response.text)
    if match:
        # news.ORF.at
        updated = match.group(1)
    else:
        # other
        updated = response.meta["updated"]
    il.add_value("updated", updated)
    il.add_css("title", "title::text", re=re.compile(r"(.*) - .*", flags=re.S))
    il.add_value("link", response.url)
    il.add_css("content_html", ".opener img")  # FM4, news
    il.add_css("content_html", ".story-lead-text")  # news
    il.add_css("content_html", "#ss-storyText")
    il.add_css("content_html", "#ss-storyContent")  # news
    il.add_value("author_name", author)
    if author in self._authors:
        il.add_value("path", author)
    il.add_value("path", response.meta["path"])
    il.add_value("category", response.meta["categories"])
    yield il.load_item()
def _parse_article(self, response): remove_elems = [ ".caption-credit", ".gallery-image-credit", "#social-left", "ul.toc", "h3:contains('Table of Contents')", "br", ".sidebar:contains('Further Reading')", ".credit", ] change_tags = {".sidebar": "blockquote", "aside": "blockquote"} replace_elems = {"div.image": self._div_to_img} il = FeedEntryItemLoader( response=response, parent=response.meta["il"], remove_elems=remove_elems, replace_elems=replace_elems, change_tags=change_tags, ) if response.meta.get("first_page", False): il.add_value("link", response.url) il.add_css("author_name", ".byline a span ::text") il.add_css("content_html", "header h2") il.add_value("path", response.meta["path"]) il.add_css("content_html", ".article-content") if response.css(".next"): return scrapy.Request( response.css(".numbers a::attr(href)").extract()[-1], self._parse_article, meta={"il": il, "path": response.meta["path"]}, ) else: return il.load_item()
def parse(self, response):
    page = json.loads(response.text)
    yield generate_feed_header(
        title=page["name"], link=page["link"], path=response.meta["page_id"]
    )
    for entry in page["posts"]["data"]:
        il = FeedEntryItemLoader()
        # updated_time also includes new comments, not only updates to the
        # post.
        il.add_value("updated", entry["created_time"])
        il.add_value(
            "link",
            "https://www.{name}/{user_id}/posts/{post_id}".format(
                name=self.name,
                **dict(zip(["user_id", "post_id"], entry["id"].split("_")))
            ),
        )
        message = entry.get("message")
        name = entry.get("name")
        link = entry.get("link")
        if message:
            message = message.splitlines()
            title = message[0]
            if len(title.split()) < 10 and not title.startswith("http"):
                # If the first line has less than ten words, it could be a
                # title.
                if title.upper() == title:
                    title = title.title()
                del message[0]
            elif name and not name.startswith("http"):
                # Fall back to the name (of the link).
                title = name
            else:
                # Fall back to the first ten words of the message.
                title = " ".join(message[0].split(maxsplit=10)[:10]) + " ..."
            message = bleach.linkify("</p><p>".join(message))
            il.add_value("content_html", "<p>{}</p>".format(message))
        elif name:
            title = name
        else:
            title = link
        il.add_value("title", title)
        if link and name:
            il.add_value(
                "content_html",
                '<p><a href="{link}">{name}</a></p>'.format(link=link, name=name),
            )
        picture = entry.get("picture")
        if picture:
            il.add_value(
                "content_html",
                '<a href="{link}"><img src="{image}"></a>'.format(
                    link=link, image=picture
                ),
            )
        il.add_value("path", response.meta["page_id"])
        yield il.load_item()
def parse_item(self, response): remove_elems = [ "aside", "script", "h1", "source", ".breadcrumbs", ".author-date", ".artikel-social-kommentar", ".bild-copyright", ".ressortTitleMobile", ".article-number", ".artikel-kommentarlink", ".umfrage-wrapper", ".articleIssueInfo", ] il = FeedEntryItemLoader( response=response, base_url="https://{}".format(self.name), remove_elems=remove_elems, ) il.add_value("link", response.url) author_name = ( response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)") or "Red." ) il.add_value("author_name", author_name) il.add_css("title", 'h1[itemprop="headline"]::text') il.add_value("updated", response.meta["updated"]) il.add_css("content_html", "article") return il.load_item()
def parse_lokalfuehrer(self, response): entries = json.loads(response.text)["hits"] for entry in entries: il = FeedEntryItemLoader( response=response, base_url="https://{}".format(self.name) ) il.add_value( "path", "lokalfuehrer_{}".format(response.meta["lokalfuehrer"]) ) il.add_value( "link", "https://www.{}/lokal/{}".format(self.name, entry["id"]) ) il.add_value("category", entry["categories"]) il.add_value("category", entry["zip"]) il.add_value("category", entry["city"]) review = entry.get("review") if review: il.add_value("title", review["post_title"]) il.add_value("title", review["post_subtitle"]) il.add_value("author_name", review["meta"].split("|")[0].title()) il.add_value("category", "review") il.add_value("updated", review["post_date"]) else: il.add_value("title", entry["name"]) for picture in entry["pictures"] or []: il.add_value( "content_html", '<img src="https://faltercdn2.falter.at/wwei/1080/{}">'.format( picture["filename"] ), ) if review: il.add_value("content_html", review["post_content"]) il.add_value("content_html", entry["category_text"]) il.add_value( "content_html", "<p>{} {}, {}</p>".format(entry["zip"], entry["city"], entry["street"]), ) if entry["location"]: il.add_value( "content_html", ( '<p><a href="https://www.google.com/maps?q={lat},{lon}">' + "Google Maps</a></p>" ).format(**entry["location"]), ) yield il.load_item()