Code Example #1
File: konsument_at.py Project: Lukas0907/feeds
 def _parse_article(self, response):
     remove_elems = ["#issue", "h1", "#slogan", "#logo", "#footer"]
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url="https://{}".format(self.name),
         remove_elems=remove_elems,
     )
     il.add_css("content_html", "#page")
     return il.load_item()
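
All of the examples on this page share the same basic pattern: build a FeedEntryItemLoader bound to the response, populate feed fields with add_value/add_css/add_xpath, and return the loaded item. Below is a minimal, self-contained sketch of that pattern; the import path feeds.loaders.FeedEntryItemLoader and the example.com spider are assumptions for illustration, not taken from the examples themselves.

 import scrapy

 from feeds.loaders import FeedEntryItemLoader  # assumed PyFeeds import path


 class ExampleSpider(scrapy.Spider):
     # Hypothetical spider, for illustration only.
     name = "example.com"
     start_urls = ["https://example.com/article"]

     def parse(self, response):
         il = FeedEntryItemLoader(
             response=response,
             base_url="https://{}".format(self.name),
             # Selectors matching elements to strip from content_html.
             remove_elems=["script", "h1"],
         )
         il.add_value("link", response.url)
         il.add_css("title", "title::text")
         il.add_css("content_html", "article")
         return il.load_item()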
Code Example #2
File: gnucash_org.py Project: Lukas0907/feeds
 def _parse_news(self, response):
     il = FeedEntryItemLoader(response=response, parent=response.meta["il"])
     il.add_xpath(
         "content_html",
         '//div[@class="newsheader" and .//a[@id="{}"]]'
         '/following-sibling::div[@class="newsinner"]'.format(
             response.meta["news_id"]
         ),
     )
     return il.load_item()
Code Example #3
File: delinski_at.py Project: Lukas0907/feeds
 def parse(self, response):
     # The restaurant list is embedded in the page as JSON inside a
     # "window.DELINSKI" script; extract it with a regular expression.
     m = re.search("window.DELINSKI, {listViewEntities: (.*)}", response.text)
     restaurants = sorted(
         json.loads(m.group(1))["restaurants"]["entities"].values(),
         key=lambda r: int(r["created"]),
         reverse=True,
     )
     for restaurant in restaurants[:20]:
         il = FeedEntryItemLoader(timezone="UTC", base_url=response.url)
         url = response.urljoin(restaurant["url"])
         il.add_value("link", url)
         il.add_value("title", restaurant["name"])
         content = """
         <img src="{image}">
         <ul>
             <li>{address}</li>
             <li>{price_range_human}</li>
             <li>{cuisine_text}</li>
         </ul>
         """
         il.add_value("content_html", content.format(**restaurant))
         il.add_value(
             "updated", datetime.utcfromtimestamp(int(restaurant["created"]))
         )
         yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})
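
Note: this example works in tandem with Code Example #7 below. The partially filled loader is handed to the detail-page request via meta={"il": il}, and _parse_restaurant picks it up as parent=response.meta["il"], so values collected on both pages are merged into a single feed entry.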
Code Example #4
File: wienerlinien_at.py Project: Lukas0907/feeds
 def parse_item(self, response):
     remove_elems = ["h1", ".delayed-image-load"]
     change_tags = {"noscript": "div"}
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         change_tags=change_tags,
         base_url="https://www.{}".format(self.name),
     )
     il.add_xpath("content_html", '//div[@id="main-inner"]')
     return il.load_item()
Code Example #5
File: diepresse_com.py Project: Lukas0907/feeds
 def parse_node(self, response, node):
     url = node.xpath("rss:loc/text()").extract_first()
     il = FeedEntryItemLoader(selector=node)
     il.add_value("link", url)
     il.add_xpath("title", "news:news/news:title/text()")
     keywords = node.xpath("news:news/news:keywords/text()").extract_first()
     if keywords:
         il.add_value("category", keywords.split(", "))
     il.add_xpath("updated", "news:news/news:publication_date/text()")
     return scrapy.Request(
         url, self.parse_item, meta={"il": il, "handle_httpstatus_list": [404]}
     )
Code Example #6
File: uebermedien_de.py Project: Lukas0907/feeds
 def _parse_article(self, response):
     remove_elems = ["script"]
     convert_footnotes = [".footnoteContent"]
     pullup_elems = {".footnoteContent": 1}
     change_tags = {".entry-content-info-box": "blockquote"}
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         change_tags=change_tags,
         base_url="https://{}".format(self.name),
         convert_footnotes=convert_footnotes,
         pullup_elems=pullup_elems,
     )
     il.add_css("content_html", ".entry-content")
     return il.load_item()
Code Example #7
File: delinski_at.py Project: Lukas0907/feeds
 def _parse_restaurant(self, response):
     il = FeedEntryItemLoader(
         response=response,
         base_url=response.url,
         parent=response.meta["il"],
         remove_elems=[".external"],
     )
     il.add_css("content_html", ".content .right p")
     il.add_css("content_html", ".restaurant-link")
     il.add_css("category", ".tags a ::text")
     yield il.load_item()
Code Example #8
File: zeitdiebin_at.py Project: Lukas0907/feeds
 def parse_item(self, response):
     il = FeedEntryItemLoader(
         response=response,
         base_url="{}/".format(self.feed_link),
         timezone="Europe/Vienna",
         dayfirst=True,
         remove_elems=[".ruler", "h1"],
     )
     il.add_css("title", "h1.event-title::text")
     il.add_value("link", response.url)
     il.add_css("content_html", "div#content.container")
     return il.load_item()
Code Example #9
File: tuwien_ac_at.py Project: Lukas0907/feeds
    def parse(self, response):
        mitteilungsblaetter = response.css(".mitteilungsblaetter")
        updated = mitteilungsblaetter.css("::text").re_first(r"(\d{2}\.\d{2}\.\d{4})")
        link = response.urljoin(
            mitteilungsblaetter.css('a::attr("href")').extract_first()
        )

        response = yield scrapy.Request(link, method="HEAD")
        mb_url = response.url
        match = re.search(
            r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)", mb_url
        )
        if not match:
            self.logger.error("No Mitteilungsblätter found!")
            return
        else:
            mb_id = match.group(1)

        url = "https://tiss.{}/api/mbl/v22/id/{}".format(self.name, mb_id)
        response = yield scrapy.Request(url)

        last_entry = None
        for entry in reversed(json.loads(response.text)["knoten"]):
            (entry["main"], entry["sub"]) = re.match(
                r"(\d+)\.?(\d*)", entry["counter"]
            ).groups()
            if last_entry is not None and last_entry["main"] == entry["main"]:
                entry["inhalt"] += "<h2>{}</h2>".format(last_entry["titel"])
                entry["inhalt"] += last_entry["inhalt"]
            if entry["sub"] == "":
                il = FeedEntryItemLoader(
                    base_url="https://tiss.{}".format(self.name),
                    timezone="Europe/Vienna",
                    dayfirst=True,
                )
                il.add_value("updated", updated)
                il.add_value("link", mb_url + "#{}".format(entry["counter"]))
                il.add_value("title", entry["titel"])
                il.add_value("content_html", entry["inhalt"])
                yield il.load_item()
                last_entry = None
            else:
                last_entry = entry
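
Note: the response = yield scrapy.Request(...) style in this example (and in Code Example #33) is not plain Scrapy; it relies on coroutine-style callbacks such as those enabled by the scrapy-inline-requests decorator, which lets a callback yield a request and receive the corresponding response back inline.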
Code Example #10
File: atv_at.py Project: Lukas0907/feeds
 def parse_program(self, response):
     if not response.css(r".jsb_video\/FlashPlayer"):
         return
     data = json.loads(
         response.css(r".jsb_video\/FlashPlayer").xpath("@data-jsb").extract()[0]
     )
     data = data["config"]["initial_video"]["parts"][0]["tracking"]["nurago"]
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", data["clipurl"])
     il.add_value("title", data["programname"])
     il.add_value("updated", data["airdate"])
     il.add_xpath("content_html", '//p[@class="plot_summary"]')
     item = il.load_item()
     # Only include videos posted in the last 7 days.
     if item["updated"] + self._timerange > datetime.now(timezone.utc):
         return item
Code Example #11
File: openwrt_org.py Project: Lukas0907/feeds
 def parse_release_changelog(self, response):
     il = FeedEntryItemLoader(
         response=response, parent=response.meta["il"], base_url=self._base_url
     )
     il.add_value("content_html", "<h1>Detailed Changelog</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     return il.load_item()
Code Example #12
File: arstechnica_com.py Project: Lukas0907/feeds
 def parse_node(self, response, node):
     link = node.xpath("link/text()").extract_first()
     il = FeedEntryItemLoader()
     il.add_value("title", node.xpath("title/text()").extract_first())
     il.add_value("updated", node.xpath("pubDate/text()").extract_first())
     il.add_value("category", node.xpath("category/text()").extract())
     return scrapy.Request(
         link,
         self._parse_article,
         cookies={"view": "mobile"},
         meta={"il": il, "path": response.meta["path"], "first_page": True},
     )
Code Example #13
File: wienerlinien_at.py Project: Lukas0907/feeds
 def parse(self, response):
     # Wiener Linien returns HTML with an XML content type which creates an
     # XmlResponse.
     response = HtmlResponse(url=response.url, body=response.body)
     for item in response.css(".block-news-item"):
         il = FeedEntryItemLoader(
             response=response,
             timezone="Europe/Vienna",
             ignoretz=True,
             base_url="https://www.{}".format(self.name),
         )
         link = response.urljoin(item.css("a::attr(href)").extract_first())
         il.add_value("link", link)
         il.add_value("title", item.css("h3::text").extract_first())
         il.add_value("updated", item.css(".date::text").extract_first())
         yield scrapy.Request(link, self.parse_item, meta={"il": il})
Code Example #14
File: falter_at.py Project: Lukas0907/feeds
 def parse_item_text(self, response):
     remove_elems = [".dachzeile", "h1", ".meta", "br", "form", ".button-container"]
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         base_url="https://{}".format(self.name),
     )
     content = response.xpath("//article").extract_first()
     if "Lesen Sie diesen Artikel in voller Länge" in content:
         il.add_value("category", "paywalled")
     il.add_value("content_html", content)
     return il.load_item()
Code Example #15
File: generic.py Project: Lukas0907/feeds
 def _parse_article(self, response):
     feed_entry = response.meta["feed_entry"]
     il = FeedEntryItemLoader(parent=response.meta["il"])
     doc = Document(response.text, url=response.url)
     il.add_value("title", doc.short_title() or feed_entry.get("title"))
     summary = feed_entry.get("summary")
     try:
         content = doc.summary(html_partial=True)
         if summary and len(summary) > len(content):
             # Something probably went wrong if the extracted content is shorter than
             # the summary.
             raise Unparseable
     except Unparseable:
         content = summary
     il.add_value("content_html", content)
     return il.load_item()
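
Note: Document and Unparseable here presumably come from the readability-lxml package (from readability import Document); doc.summary(html_partial=True) returns the extracted article body without the surrounding html/body skeleton. The length check guards against bad extractions by falling back to the feed's own summary.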
Code Example #16
File: falter_at.py Project: Lukas0907/feeds
 def parse_lokalfuehrer(self, response):
     entries = json.loads(response.text)[0]["hits"]
     for entry in entries:
         il = FeedEntryItemLoader(
             response=response, base_url="https://{}".format(self.name)
         )
         il.add_value(
             "path", "lokalfuehrer_{}".format(response.meta["lokalfuehrer"])
         )
         il.add_value(
             "link", "https://www.{}/lokal/{}".format(self.name, entry["id"])
         )
         il.add_value("category", entry["categories"])
         il.add_value("category", entry["zip"])
         il.add_value("category", entry["city"])
         review = entry.get("review")
         if review:
             il.add_value("title", review["post_title"])
             il.add_value("title", review["post_subtitle"])
             il.add_value("author_name", review["meta"].split("|")[0].title())
             il.add_value("category", "review")
             il.add_value("updated", review["post_date"])
         else:
             il.add_value("title", entry["name"])
         if "pictures" in entry and entry["pictures"]:
             il.add_value(
                 "content_html",
                 '<img src="https://fcc.at/ef/img720/{}">'.format(
                     entry["pictures"][0]["filename"]
                 ),
             )
         if review:
             il.add_value("content_html", review["post_content"])
         il.add_value("content_html", entry["category_text"])
         il.add_value(
             "content_html",
             "<p>{} {}, {}</p>".format(entry["zip"], entry["city"], entry["street"]),
         )
         if entry["location"]:
             il.add_value(
                 "content_html",
                 (
                     '<p><a href="https://www.google.com/maps?q={lat},{lon}">'
                     + "Google Maps</a></p>"
                 ).format(**entry["location"]),
             )
         yield il.load_item()
Code Example #17
File: falter_at.py Project: Lukas0907/feeds
 def parse_archive_search(self, response):
     for i, item in enumerate(json.loads(response.text)["result"]["hits"]):
         il = FeedEntryItemLoader(
             response=response,
             base_url="https://{}".format(self.name),
             timezone="Europe/Vienna",
         )
         il.add_value("path", "magazine")
         link = response.urljoin(item["detail_link"])
         il.add_value("link", link)
         try:
             author = re.sub(
                 r"(?:.*:|Von)\s*(.*)", r"\1", ", ".join(item["authors"]).title()
             )
             il.add_value("author_name", author)
         except IndexError:
             pass
         il.add_value("title", item["title"])
         # All articles have the same date.
         # We add an offset so they are sorted in the right order.
         date = response.meta["issue_date"] + timedelta(seconds=i)
         il.add_value("updated", date)
         yield scrapy.Request(link, self.parse_item_text, meta={"il": il})
Code Example #18
File: biblioweb_at.py Project: Lukas0907/feeds
    def parse_content(self, response):
        parts = self._extract_parts(response)
        il = FeedEntryItemLoader(
            response=response, timezone="Europe/Vienna", dayfirst=True
        )
        il.add_value("path", self._library)
        il.add_value("title", " - ".join(parts[: self._find_first_meta(parts)]))
        il.add_value("link", response.url)
        il.add_xpath("updated", "//td/span/text()", re="In der Bibliothek seit: (.*)")

        _content = ["<ul>"]
        for part in parts:
            _content.append("<li>{}</li>".format(part))
        _content.append("</ul>")
        il.add_value("content_html", "".join(_content))
        return il.load_item()
Code Example #19
 def _parse_article(self, response):
     remove_elems = ["h1", "#contents", ".headerlink"]
     change_tags = {".admonition-title": "h2"}
     il = FeedEntryItemLoader(
         response=response,
         base_url=response.url,
         remove_elems=remove_elems,
         change_tags=change_tags,
     )
     il.add_value("link", response.url)
     il.add_value("author_name", "Brandon Rhodes")
     # Use "Last-Modified" field or fall back to "Date".
     updated = (
         response.headers.get("Last-Modified", response.headers.get("Date"))
     ).decode("ascii")
     il.add_value("updated", updated)
     il.add_css("title", "title::text")
     il.add_css("content_html", ".section")
     return il.load_item()
Code Example #20
File: falter_at.py Project: marcelogp/PyFeeds
 def parse_archive_search(self, response):
     articles = json.loads(response.text)["articles"]["hits"]
     for i, item in enumerate(articles):
         il = FeedEntryItemLoader(
             response=response,
             base_url="https://{}".format(self.name),
             timezone="Europe/Vienna",
         )
         il.add_value("path", "magazine")
         link = response.urljoin(item["detail_link"])
         il.add_value("link", link)
         try:
             author = re.sub(
                 r"(?:.*:|Von)\s*(.*)", r"\1", ", ".join(item["authors"]).title()
             )
             il.add_value("author_name", author)
         except IndexError:
             pass
         il.add_value("title", item["title"])
         # All articles have the same date.
         # We add an offset so they are sorted in the right order.
         date = response.meta["issue_date"] + timedelta(seconds=i)
         il.add_value("updated", date)
         il.add_value("category", item["ressort"])
         yield scrapy.Request(link, self.parse_item_text, meta={"il": il})
Code Example #21
 def parse_broadcast(self, response):
     broadcast = json.loads(response.text)
     il = FeedEntryItemLoader(response=response,
                              timezone=self._timezone,
                              dayfirst=False)
     link = 'https://{}/programm/{}/{}'.format(self.name,
                                               response.meta['oe1_day'],
                                               broadcast['programKey'])
     il.add_value('link', link)
     il.add_value('title', broadcast['programTitle'])
     il.add_value('title', broadcast['title'])
     if broadcast.get('streams'):
         stream = 'http://loopstream01.apa.at/?channel=oe1&id={}'.format(
             broadcast['streams'][0]['loopStreamId'])
         il.add_value('enclosure_iri', stream)
         il.add_value('enclosure_type', 'audio/mpeg')
     il.add_value('updated', broadcast['niceTimeISO'])
     if broadcast['subtitle']:
         il.add_value('content_html',
                      '<strong>{}</strong>'.format(broadcast['subtitle']))
     for item in broadcast['items']:
         if 'title' in item:
             il.add_value('content_html',
                          '<h3>{}</h3>'.format(item['title']))
         il.add_value('content_html', item.get('description'))
     il.add_value('content_html', broadcast['description'])
     yield il.load_item()
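
Note: this older snippet sets separate enclosure_iri and enclosure_type values, while the newer TVthek examples (#25 and #34) pass a single enclosure value as a dict with iri and type keys.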
Code Example #22
File: ak_ciando_com.py Project: Lukas0907/feeds
 def parse_item(self, response):
     il = FeedEntryItemLoader(
         selector=response.xpath('//div[@id="maincontentbook"]'),
         base_url=self.feed_link,
     )
     il.add_xpath("title", '//h1[@class="p_book_title"]/text()')
     il.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()')
     il.add_value("link", response.url)
     il.add_value("author_name", self.feed_title)
     il.add_xpath("content_html", '//h1[@class="p_book_title"]/text()')
     il.add_xpath("content_html", '//h2[@class="p_book_author"]/text()')
     il.add_xpath("content_html", '//p[@class="p_book_publisher"]/text()')
     il.add_xpath("content_html", '//p[@class="p_book_isbn"]/text()')
     il.add_xpath("content_html", '(//span[@class="txt10px"])[1]/text()')
     il.add_xpath("content_html", '(//span[@class="txt10px"])[3]/text()')
     il.add_xpath("content_html", '//div[@class="bookcontent"]//text()')
     il.add_xpath("content_html", '//div[@class="p_book_image"]/img')
     il.add_xpath("content_html", '//span[@style="color:red;"]/b/text()')
     return il.load_item()
Code Example #23
File: derstandard_at.py Project: Lukas0907/feeds
    def _parse_article(self, response):
        def _fix_img_src(elem):
            src = elem.attrib.pop("data-zoom-src", None)
            # data-zoom-src is only valid if it starts with //images.derstandard.at.
            if src and src.startswith("//images.derstandard.at"):
                elem.attrib["src"] = src
            elem.attrib.pop("width", None)
            elem.attrib.pop("height", None)
            elem.attrib.pop("class", None)
            return elem

        remove_elems = [
            ".credits",
            ".owner-info",
            ".image-zoom",
            ".continue",
            ".sequence-number",
            ".js-embed-output",
            "#mycountrytalks-embed",
            # Remove self-promotion for (other) ressorts.
            '.js-embed-output-feeds a[href^="/r"]',
            '.js-embed-output-feeds a[href^="https://derstandard.at/"]',
            (
                ".js-embed-output-feeds "
                + 'img[src="https://images.derstandard.at/2018/10/18/'
                + 'Immobiliensuche202x122.png"]'
            ),
        ]
        change_tags = {
            "#media-list li .description": "figcaption",
            "#media-list li": "figure",
            "#media-list": "div",
            ".photo": "figure",
            ".caption": "figcaption",
        }
        replace_elems = {
            ".embedded-posting": "<p><em>Hinweis: Das eingebettete Posting ist nur "
            + "im Artikel verfügbar.</em></p>",
            # Replace every special script container with its unescaped content.
            "script.js-embed-template": lambda elem: (
                '<div class="js-embed-output-feeds">'
                + html.unescape(elem.text or "")
                + "</div>"
            ),
            "img": _fix_img_src,
        }
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
            replace_elems=replace_elems,
        )
        il.add_value("link", response.url)
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        for author in response.css("span.author::text").extract():
            # Sometimes the author name is messed up and written in upper case.
            # This happens usually for articles written by Günter Traxler.
            if author.upper() == author:
                author = author.title()
            il.add_value("author_name", author)
        il.add_value("path", response.meta["ressort"])
        il.add_value("updated", response.meta["updated"])
        il.add_css("category", "#breadcrumb .item a::text")
        blog_id = response.css("#userblogentry::attr(data-objectid)").extract_first()
        if blog_id:
            url = (
                "https://{}/userprofil/bloggingdelivery/blogeintrag?godotid={}"
            ).format(self.name, blog_id)
            return scrapy.Request(url, self._parse_blog_article, meta={"il": il})
        elif response.css("#feature-content"):
            cover_photo = response.css("#feature-cover-photo::attr(style)").re_first(
                r"\((.*)\)"
            )
            il.add_value("content_html", '<img src="{}">'.format(cover_photo))
            il.add_css("content_html", "#feature-cover-title h2")
            il.add_css("content_html", "#feature-content > .copytext")
            return il.load_item()
        else:
            il.add_css("content_html", "#content-aside")
            il.add_css("content_html", "#objectContent > .copytext")
            il.add_css("content_html", "#content-main > .copytext")
            il.add_css("content_html", ".slide")
            return il.load_item()
Code Example #24
File: falter_at.py Project: marcelogp/PyFeeds
 def parse_blog_article(self, response):
     remove_elems = [".ad-component", ".wp-caption-text"]
     il = FeedEntryItemLoader(
         response=response,
         remove_elems=remove_elems,
         base_url="https://cms.{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
         yearfirst=False,
     )
     il.add_css("content_html", "article > h2")
     il.add_css("content_html", ".storycontent-article")
     il.add_css("author_name", ".falter-heading ::text", MapCompose(str.title))
     il.add_css(
         "author_name", ".thinktank-meta > span ::text", MapCompose(str.title)
     )
     il.add_css("updated", ".post > .text-label ::text", re=r"(\d{2}\.\d{2}\.\d{4})")
     il.add_value("link", response.url)
     il.add_value("path", "blog_{}".format(response.meta["blog"]))
     il.add_css("title", "article > h1 ::text")
     return il.load_item()
Code Example #25
    def _parse_episode(self, response):
        item = json.loads(response.text)
        il = FeedEntryItemLoader()
        il.add_value("title", item["title"])
        il.add_value(
            "content_html",
            '<img src="{}">'.format(item["playlist"]["preview_image_url"]),
        )
        if item["description"]:
            il.add_value("content_html",
                         item["description"].replace("\r\n", "<br>"))
        il.add_value("updated", item["date"])
        il.add_value("link", item["url"].replace("api-tvthek.orf.at",
                                                 "tvthek.orf.at"))
        # Check how many segments are part of this episode.
        if len(item["_embedded"]["segments"]) == 1:
            # If only one segment, item["sources"] contains invalid links.
            # We use the first embedded segment instead.
            # This is also how mediathekviewweb.de works.
            item["sources"] = item["_embedded"]["segments"][0]["sources"]
        try:
            video = next(s for s in item["sources"]["progressive_download"]
                         if s["quality_key"] == "Q8C")
            il.add_value("enclosure", {
                "iri": video["src"],
                "type": "video/mp4"
            })
        except StopIteration:
            self.logger.warning("Could not extract video for '{}'!".format(
                item["title"]))
            raise DropResponse(
                f"Skipping {response.url} because not downloadable yet",
                transient=True,
            )

        subtitle = item["_embedded"].get("subtitle")
        if subtitle:
            subtitle = subtitle["_embedded"]["srt_file"]["public_urls"][
                "reference"]
            il.add_value("enclosure", {
                "iri": subtitle["url"],
                "type": "text/plain"
            })
        else:
            self.logger.debug("No subtitle file found for '{}'".format(
                item["url"]))
        il.add_value(
            "category",
            self._categories_from_oewa_base_path(
                item["_embedded"]["profile"]["oewa_base_path"]),
        )
        return il.load_item()
Code Example #26
File: atv_at.py Project: marcelogp/PyFeeds
 def parse_program(self, response):
     if not response.css(r".jsb_video\/FlashPlayer"):
         return
     data = json.loads(
         response.css(r".jsb_video\/FlashPlayer").xpath(
             "@data-jsb").extract()[0])
     data = data["config"]["initial_video"]["parts"][0]["tracking"][
         "nurago"]
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", data["clipurl"])
     il.add_value("title", data["programname"])
     il.add_value("updated", data["airdate"])
     il.add_xpath("content_html", '//p[@class="plot_summary"]')
     item = il.load_item()
     # Only include videos posted in the last 7 days.
     if item["updated"] + self._timerange > datetime.now(timezone.utc):
         return item
Code Example #27
File: generic.py Project: Lukas0907/feeds
 def parse(self, response):
     feed = feedparser.parse(io.BytesIO(response.body))
     if "entries" not in feed or not feed["entries"]:
         self.logger.error("Feed {} contains no entries!".format(response.url))
         return
     feed_entries = feed["entries"]
     feed = feed["feed"]
     yield generate_feed_header(
         title=feed.get("title"),
         subtitle=feed.get("subtitle"),
         link=feed.get("link") or response.url,
         path=response.meta["path"],
         author_name=feed.get("author_detail", {}).get("name"),
         logo=feed.get("image", {}).get("href"),
     )
     base_url = "://".join(urlparse(response.url)[:2])
     for entry in feed_entries:
         # Deals with protocol-relative URLs.
         link = urljoin(base_url, entry["link"])
         il = FeedEntryItemLoader(base_url=base_url)
         il.add_value("path", response.meta["path"])
         il.add_value("updated", entry.get("updated") or entry.get("published"))
         il.add_value("author_name", entry.get("author_detail", {}).get("name"))
         il.add_value("link", link)
         il.add_value("category", [t["term"] for t in entry.get("tags", [])])
         if response.meta["fulltext"]:
             il.add_value("title", entry["title"])
             il.add_value("content_html", entry["content"][0]["value"])
             yield il.load_item()
         else:
             # Content is not part of the feed, scrape it.
             yield scrapy.Request(
                 link, self._parse_article, meta={"feed_entry": entry, "il": il}
             )
Code Example #28
File: vice_com.py Project: Lukas0907/feeds
 def parse(self, response):
     articles = json.loads(response.text)
     remove_elems = [
         "hr + p",
         "hr",
         "iframe",
         "p i:last-of-type:contains('Facebook'):contains('Twitter')",
     ]
     for article in articles:
         il = FeedEntryItemLoader(timezone="UTC", remove_elems=remove_elems)
         il.add_value("title", article["title"])
         il.add_value("link", article["url"])
         if "thumbnail_url_1_1" in article:
             il.add_value(
                 "content_html",
                 '<img src="{}">'.format(article["thumbnail_url_1_1"]),
             )
         il.add_value("content_html", article["body"])
         il.add_value(
             "updated", datetime.utcfromtimestamp(article["publish_date"] / 1000)
         )
         il.add_value(
             "author_name",
             [
                 contribution["contributor"]["full_name"]
                 for contribution in article["contributions"]
             ],
         )
         il.add_value("category", article["channel"]["name"])
         for topic in article["topics"] + [article["primary_topic"]]:
             if topic and "name" in topic:
                 il.add_value("category", topic["name"].title())
         if article["nsfw"]:
             il.add_value("category", "nsfw")
         if article["nsfb"]:
             il.add_value("category", "nsfb")
         il.add_value("path", response.meta["locale"])
         yield il.load_item()
Code Example #29
    def parse_item(self, response):
        il = FeedEntryItemLoader(
            response=response,
            base_url="{}/cms/".format(self.feed_link),
            timezone="Europe/Vienna",
            remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"],
            remove_elems_xpath=[
                '//div[@class="news-single-item"]/b[1]',
                '//div[@class="news-single-item"]/br[1]',
            ],
            dayfirst=True,
        )

        il.add_value(
            "title", response.xpath("//head/title/text()").re_first(r"::: (.*)")
        )

        il.add_value("link", response.url)

        il.add_value(
            "updated",
            response.xpath('//div[@class="news-single-rightbox"]').re_first(
                r"(\d{2}\.\d{2}\.\d{4})"
            ),
        )

        il.add_value(
            "author_name",
            response.xpath('//head/meta[@name="publisher"]/@content').re_first(
                "recht.at, (.*);"
            ),
        )
        il.add_xpath("author_name", '//head/meta[@name="author"]/@content')
        il.add_value("author_name", self.name)

        il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')

        il.add_css("content_html", ".news-single-item h7 font strong")
        il.add_css("content_html", ".news-single-item")

        return il.load_item()
Code Example #30
File: ak_ciando_com.py Project: rodarima/PyFeeds
 def parse_item(self, response):
     il = FeedEntryItemLoader(
         selector=response.xpath('//div[@id="maincontentbook"]'),
         base_url=self.feed_link,
     )
     il.add_xpath("title", '//h1[@class="p_book_title"]/text()')
     il.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()')
     il.add_value("link", response.url)
     il.add_value("author_name", self.feed_title)
     il.add_xpath("content_html", '//h1[@class="p_book_title"]/text()')
     il.add_xpath("content_html", '//h2[@class="p_book_author"]/text()')
     il.add_xpath("content_html", '//p[@class="p_book_publisher"]/text()')
     il.add_xpath("content_html", '//p[@class="p_book_isbn"]/text()')
     il.add_xpath("content_html", '(//span[@class="txt10px"])[1]/text()')
     il.add_xpath("content_html", '(//span[@class="txt10px"])[3]/text()')
     il.add_xpath("content_html", '//div[@class="bookcontent"]//text()')
     il.add_xpath("content_html", '//div[@class="p_book_image"]/img')
     il.add_xpath("content_html", '//span[@style="color:red;"]/b/text()')
     return il.load_item()
Code Example #31
File: derstandard_at.py Project: Lukas0907/feeds
 def _parse_user_profile(self, response):
     self._users[response.meta["user_id"]] = (
         response.css("#up_user h2::text").extract_first().strip()
     )
     for posting in response.css(".posting"):
         il = FeedEntryItemLoader(
             selector=posting,
             base_url="https://{}".format(self.name),
             change_tags={"span": "p"},
         )
         il.add_css("title", ".text strong::text")
         il.add_css("link", '.text a::attr("href")')
         # The data-timestamp attribute is in milliseconds.
         il.add_value(
             "updated",
             datetime.utcfromtimestamp(
                 int(posting.css('.date::attr("data-timestamp")').extract_first())
                 / 1000
             ),
         )
         il.add_css("content_html", ".text span")
         il.add_css("content_html", ".article h4")
         il.add_value("path", response.meta["path"])
         yield il.load_item()
Code Example #32
File: help_gv_at.py Project: Lukas0907/feeds
 def _parse_item(self, response):
     remove_elems = [
         "h1",
         ".nono",
         ".acceptance_org",
         ".state",
         "script",
         ".gentics-portletreload-position-notvisibleposition",
     ]
     remove_elems_xpath = [
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     change_tags = {"abbr": "span"}
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Vienna",
         base_url="https://www.{}".format(self.name),
         remove_elems=remove_elems,
         remove_elems_xpath=remove_elems_xpath,
         change_tags=change_tags,
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "author_name",
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
     il.add_value(
         "updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")
     )
     il.add_css("content_html", ".Content")
     return il.load_item()
Code Example #33
File: orf_at.py Project: Lukas0907/feeds
    def _parse_article(self, response):
        # Heuristic for news.ORF.at to detect teaser articles.
        more = self._extract_link(
            response.css(
                ".story-story p > strong:contains('Mehr') + a::attr(href), "
                + ".story-story p > a:contains('Lesen Sie mehr')::attr(href)"
            ).extract_first()
        )
        if more and more != response.url:
            self.logger.debug("Detected teaser article, redirecting to {}".format(more))
            response = yield scrapy.Request(more, meta=response.meta)

        remove_elems = [
            ".byline",
            "h1",
            ".socialshare",
            ".socialShareWrapper",
            ".socialButtons",
            ".credit",
            ".toplink",
            ".offscreen",
            ".storyMeta",
            "script",
            ".oon-youtube-logo",
            ".vote",
            # redesign
            "#more-to-read-anchor",
            ".social-buttons",
            ".story-horizontal-ad",
            ".linkcard",
        ]
        pullup_elems = {
            ".remote .slideshow": 1,
            ".remote .instagram": 1,
            ".remote .facebook": 1,
            ".remote .twitter": 1,
            ".remote .youtube": 1,
            ".remote table": 1,
        }
        replace_elems = {
            ".video": "<p><em>Hinweis: Das eingebettete Video ist nur im Artikel "
            + "verfügbar.</em></p>"
        }
        change_attribs = {"img": {"data-src": "src", "srcset": "src"}}
        change_tags = {
            ".image": "figure",
            ".caption": "figcaption",
            ".fact": "blockquote",  # FM4
        }
        author, author_selector = self._extract_author(response)
        if author:
            self.logger.debug("Extracted possible author '{}'".format(author))
            # Remove the paragraph that contains the author.
            remove_elems.insert(0, author_selector)
        else:
            self.logger.debug("Could not extract author name")
            author = "{}.ORF.at".format(response.meta["path"])

        for slideshow in response.css(".slideshow"):
            link = response.urljoin(
                slideshow.css('::attr("data-slideshow-json-href")').extract_first()
            ).replace("jsonp", "json")
            slideshow_id = slideshow.css('::attr("id")').extract_first()
            slideshow_response = yield scrapy.Request(link)
            replace_elems["#{}".format(slideshow_id)] = self._create_slideshow_html(
                slideshow_response
            )

        il = FeedEntryItemLoader(
            response=response,
            remove_elems=remove_elems,
            pullup_elems=pullup_elems,
            replace_elems=replace_elems,
            change_attribs=change_attribs,
            change_tags=change_tags,
        )

        # The field is part of a JSON that is sometimes not valid, so don't bother with
        # parsing it properly.
        match = re.search(r'"datePublished": "([^"]+)"', response.text)
        if match:
            # news.ORF.at
            updated = match.group(1)
        else:
            # other
            updated = response.meta["updated"]
        il.add_value("updated", updated)
        il.add_css("title", "title::text", re=re.compile(r"(.*) - .*", flags=re.S))
        il.add_value("link", response.url)
        il.add_css("content_html", ".opener img")  # FM4, news
        il.add_css("content_html", ".story-lead-text")  # news
        il.add_css("content_html", "#ss-storyText")
        il.add_css("content_html", "#ss-storyContent")  # news
        il.add_value("author_name", author)
        if author in self._authors:
            il.add_value("path", author)
        il.add_value("path", response.meta["path"])
        il.add_value("category", response.meta["categories"])
        yield il.load_item()
Code Example #34
File: tvthek_orf_at.py Project: Lukas0907/feeds
    def _parse_episode(self, response):
        item = json.loads(response.text)
        il = FeedEntryItemLoader()
        il.add_value("title", item["title"])
        il.add_value(
            "content_html",
            '<img src="{}">'.format(item["playlist"]["preview_image_url"]),
        )
        if item["description"]:
            il.add_value("content_html", item["description"].replace("\r\n", "<br>"))
        il.add_value("updated", item["date"])
        il.add_value("link", item["url"].replace("api-tvthek.orf.at", "tvthek.orf.at"))
        # Check how many segments are part of this episode.
        if len(item["_embedded"]["segments"]) == 1:
            # If only one segment, item["sources"] contains invalid links.
            # We use the first embedded segment instead.
            # This is also how mediathekviewweb.de works.
            item["sources"] = item["_embedded"]["segments"][0]["sources"]
        try:
            video = next(
                s
                for s in item["sources"]["progressive_download"]
                if s["quality_key"] == "Q8C"
            )
            il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
        except StopIteration:
            self.logger.warning(
                "Could not extract video for '{}'!".format(item["title"])
            )
            raise DropResponse(
                "Skipping {} because not downloadable yet".format(response.url),
                transient=True,
            )

        subtitle = item["_embedded"].get("subtitle")
        if subtitle:
            subtitle = subtitle["_embedded"]["srt_file"]["public_urls"]["reference"]
            il.add_value("enclosure", {"iri": subtitle["url"], "type": "text/plain"})
        else:
            self.logger.debug("No subtitle file found for '{}'".format(item["url"]))
        il.add_value(
            "category",
            self._categories_from_oewa_base_path(
                item["_embedded"]["profile"]["oewa_base_path"]
            ),
        )
        return il.load_item()
Code Example #35
File: arstechnica_com.py Project: Lukas0907/feeds
 def _parse_article(self, response):
     remove_elems = [
         ".caption-credit",
         ".gallery-image-credit",
         "#social-left",
         "ul.toc",
         "h3:contains('Table of Contents')",
         "br",
         ".sidebar:contains('Further Reading')",
         ".credit",
     ]
     change_tags = {".sidebar": "blockquote", "aside": "blockquote"}
     replace_elems = {"div.image": self._div_to_img}
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         replace_elems=replace_elems,
         change_tags=change_tags,
     )
     if response.meta.get("first_page", False):
         il.add_value("link", response.url)
         il.add_css("author_name", ".byline a span ::text")
         il.add_css("content_html", "header h2")
         il.add_value("path", response.meta["path"])
     il.add_css("content_html", ".article-content")
     if response.css(".next"):
         return scrapy.Request(
             response.css(".numbers a::attr(href)").extract()[-1],
             self._parse_article,
             meta={"il": il, "path": response.meta["path"]},
         )
     else:
         return il.load_item()
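
Note the pagination pattern: while a .next element exists, the same loader is threaded through to the following page via meta={"il": il}, and the finished item is only loaded once the last page has been parsed.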
Code Example #36
File: facebook_com.py Project: Lukas0907/feeds
 def parse(self, response):
     page = json.loads(response.text)
     yield generate_feed_header(
         title=page["name"], link=page["link"], path=response.meta["page_id"]
     )
     for entry in page["posts"]["data"]:
         il = FeedEntryItemLoader()
          # updated_time also includes new comments, not only updates to the
          # post.
         il.add_value("updated", entry["created_time"])
         il.add_value(
             "link",
             "https://www.{name}/{user_id}/posts/{post_id}".format(
                 name=self.name,
                 **dict(zip(["user_id", "post_id"], entry["id"].split("_")))
             ),
         )
         message = entry.get("message")
         name = entry.get("name")
         link = entry.get("link")
         if message:
             message = message.splitlines()
             title = message[0]
             if len(title.split()) < 10 and not title.startswith("http"):
                 # If the first line has less than ten words, it could be a
                 # title.
                 if title.upper() == title:
                     title = title.title()
                 del message[0]
             elif name and not name.startswith("http"):
                 # Fallback to the name (of the link).
                 title = name
             else:
                 # Fallback to the first ten words of the message.
                 title = " ".join(message[0].split(maxsplit=10)) + " ..."
             message = bleach.linkify("</p><p>".join(message))
             il.add_value("content_html", "<p>{}</p>".format(message))
         elif name:
             title = name
         else:
             title = link
         il.add_value("title", title)
         if link and name:
             il.add_value(
                 "content_html",
                 '<p><a href="{link}">{name}</a></p>'.format(link=link, name=name),
             )
         picture = entry.get("picture")
         if picture:
             il.add_value(
                 "content_html",
                 '<a href="{link}"><img src="{image}"></a>'.format(
                     link=link, image=picture
                 ),
             )
         il.add_value("path", response.meta["page_id"])
         yield il.load_item()
Code Example #37
File: profil_at.py Project: Lukas0907/feeds
 def parse_item(self, response):
     remove_elems = [
         "aside",
         "script",
         "h1",
         "source",
         ".breadcrumbs",
         ".author-date",
         ".artikel-social-kommentar",
         ".bild-copyright",
         ".ressortTitleMobile",
         ".article-number",
         ".artikel-kommentarlink",
         ".umfrage-wrapper",
         ".articleIssueInfo",
     ]
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         remove_elems=remove_elems,
     )
     il.add_value("link", response.url)
     author_name = (
         response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)") or "Red."
     )
     il.add_value("author_name", author_name)
     il.add_css("title", 'h1[itemprop="headline"]::text')
     il.add_value("updated", response.meta["updated"])
     il.add_css("content_html", "article")
     return il.load_item()
Code Example #38
File: falter_at.py Project: marcelogp/PyFeeds
 def parse_lokalfuehrer(self, response):
     entries = json.loads(response.text)["hits"]
     for entry in entries:
         il = FeedEntryItemLoader(
             response=response, base_url="https://{}".format(self.name)
         )
         il.add_value(
             "path", "lokalfuehrer_{}".format(response.meta["lokalfuehrer"])
         )
         il.add_value(
             "link", "https://www.{}/lokal/{}".format(self.name, entry["id"])
         )
         il.add_value("category", entry["categories"])
         il.add_value("category", entry["zip"])
         il.add_value("category", entry["city"])
         review = entry.get("review")
         if review:
             il.add_value("title", review["post_title"])
             il.add_value("title", review["post_subtitle"])
             il.add_value("author_name", review["meta"].split("|")[0].title())
             il.add_value("category", "review")
             il.add_value("updated", review["post_date"])
         else:
             il.add_value("title", entry["name"])
         for picture in entry["pictures"] or []:
             il.add_value(
                 "content_html",
                 '<img src="https://faltercdn2.falter.at/wwei/1080/{}">'.format(
                     picture["filename"]
                 ),
             )
         if review:
             il.add_value("content_html", review["post_content"])
         il.add_value("content_html", entry["category_text"])
         il.add_value(
             "content_html",
             "<p>{} {}, {}</p>".format(entry["zip"], entry["city"], entry["street"]),
         )
         if entry["location"]:
             il.add_value(
                 "content_html",
                 (
                     '<p><a href="https://www.google.com/maps?q={lat},{lon}">'
                     + "Google Maps</a></p>"
                 ).format(**entry["location"]),
             )
         yield il.load_item()