Ejemplo n.º 1
0
    def _parse_article(self, response):
        """Extract title and content for a feed entry from an article page.

        Falls back to the feed-provided title/summary when the response has
        no text body (e.g. a PDF) or when readability extraction appears to
        have failed.
        """
        feed_entry = response.meta["feed_entry"]

        il = FeedEntryItemLoader(parent=response.meta["il"])
        try:
            # Accessing .text raises AttributeError on non-text responses.
            response.text
        except AttributeError:
            # Response is not text (e.g. PDF, ...).
            il.add_value("title", feed_entry.get("title"))
            il.add_value("content_html", feed_entry.get("summary"))
            return il.load_item()

        doc = Document(response.text, url=response.url)
        il.add_value("title", doc.short_title() or feed_entry.get("title"))
        summary = feed_entry.get("summary")
        try:
            content = doc.summary(html_partial=True)
            if summary and len(summary) > len(content):
                # Something probably went wrong if the extracted content is shorter than
                # the summary.
                raise Unparseable
        except Unparseable:
            content = summary
        il.add_value("content_html", content)

        return il.load_item()
Ejemplo n.º 2
0
 def parse_item(self, response):
     """Build a feed entry from an article page, stripping boilerplate."""
     # Navigation/boilerplate elements that must not end up in the content.
     unwanted = [
         "aside",
         "script",
         "h1",
         "source",
         ".breadcrumbs",
         ".author-date",
         ".artikel-social-kommentar",
         ".bild-copyright",
         ".ressortTitleMobile",
         ".article-number",
         ".artikel-kommentarlink",
         ".umfrage-wrapper",
         ".articleIssueInfo",
     ]
     loader = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         remove_elems=unwanted,
     )
     loader.add_value("link", response.url)
     # Match the "Von <first> <last>" byline; fall back to "Red." if absent.
     byline = response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)")
     loader.add_value("author_name", byline or "Red.")
     loader.add_css("title", 'h1[itemprop="headline"]::text')
     loader.add_value("updated", response.meta["updated"])
     loader.add_css("content_html", "article")
     return loader.load_item()
Ejemplo n.º 3
0
 def parse_release_changelog(self, response):
     """Add a "Detailed Changelog" section to the entry started upstream."""
     entry = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url=self._base_url,
     )
     entry.add_value("content_html", "<h1>Detailed Changelog</h1>")
     # Everything after the page's first <h1> is the changelog body.
     entry.add_xpath("content_html", "//h1/following-sibling::*")
     return entry.load_item()
Ejemplo n.º 4
0
 def parse_item(self, response):
     """Build a feed entry from an article page.

     Strips boilerplate elements and extracts link, author, title, date,
     and the article body.
     """
     # Navigation/boilerplate elements excluded from the entry content.
     remove_elems = [
         "aside",
         "script",
         "h1",
         "source",
         ".breadcrumbs",
         ".author-date",
         ".artikel-social-kommentar",
         ".bild-copyright",
         ".ressortTitleMobile",
         ".article-number",
         ".artikel-kommentarlink",
         ".umfrage-wrapper",
         ".articleIssueInfo",
     ]
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         remove_elems=remove_elems,
     )
     il.add_value("link", response.url)
     # Byline match "Von <first> <last>", falling back to "Red.".
     author_name = (
         response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)") or "Red."
     )
     il.add_value("author_name", author_name)
     il.add_css("title", 'h1[itemprop="headline"]::text')
     il.add_value("updated", response.meta["updated"])
     il.add_css("content_html", "article")
     return il.load_item()
Ejemplo n.º 5
0
    def parse_item(self, response):
        """Parse a news item page into a feed entry.

        Title, date, and author are pulled from head metadata and the
        right-hand box; several author fallbacks are tried in order.
        """
        il = FeedEntryItemLoader(
            response=response,
            base_url='{}/cms/'.format(self._link),
            timezone=self._timezone,
            remove_elems=['.news-latest-date', '.news-single-rightbox', 'hr',
                          'h7'],
            remove_elems_xpath=['//div[@class="news-single-item"]/b[1]',
                                '//div[@class="news-single-item"]/br[1]'],
        )

        # The part of the <title> after ":::" is the actual item title.
        il.add_value(
            'title',
            response.xpath('//head/title/text()').re_first(r'::: (.*)'))

        il.add_value('link', response.url)

        # Date in dd.mm.yyyy format from the right-hand box.
        il.add_value(
            'updated',
            response.xpath('//div[@class="news-single-rightbox"]').
            re_first(r'(\d{2}\.\d{2}\.\d{4})'))

        # Author fallbacks: publisher meta, author meta, then spider name.
        il.add_value(
            'author_name',
            response.xpath('//head/meta[@name="publisher"]/@content').
            re_first('recht.at, (.*);'))
        il.add_xpath('author_name', '//head/meta[@name="author"]/@content')
        il.add_value('author_name', self.name)

        il.add_xpath('author_email', '//head/meta[@name="reply-to"]/@content')

        il.add_css('content_html', '.news-single-item h7 font strong')
        il.add_css('content_html', '.news-single-item')

        yield il.load_item()
Ejemplo n.º 6
0
    def parse_item(self, response):
        """Parse an article page into a feed entry."""
        # Byline text contains "von <author>"; default to "Red." when absent.
        author_date = " ".join(response.css(".author-date ::text").extract())
        match = re.search(r"von\s+(.*)", author_date)
        author_name = match.group(1) if match else "Red."

        # Navigation/boilerplate elements excluded from the entry content.
        remove_elems = [
            "aside",
            "script",
            "h1",
            "source",
            ".breadcrumbs",
            ".author-date",
            ".artikel-social-kommentar",
            ".bild-copyright",
            ".ressortTitleMobile",
            ".article-number",
            ".artikel-kommentarlink",
            ".umfrage-wrapper",
            ".articleIssueInfo",
            "hr",
            "center div[style='padding: 10px; background:#efefef']",
        ]
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=remove_elems,
        )
        il.add_value("link", response.url)
        il.add_value("author_name", author_name)
        il.add_css("title", 'h1[itemprop="headline"]::text')
        il.add_value("updated", response.meta["updated"])
        il.add_css("content_html", "article")
        return il.load_item()
Ejemplo n.º 7
0
 def parse(self, response):
     """Turn a JSON article listing into feed entries."""
     articles = json.loads(response.text)
     for article in articles:
         il = FeedEntryItemLoader()
         il.add_value('title', article['title'])
         il.add_value('link', article['url'])
         # Lead with the square thumbnail when one is present.
         if 'thumbnail_url_1_1' in article:
             il.add_value(
                 'content_html',
                 '<img src="{}">'.format(article['thumbnail_url_1_1']))
         il.add_value('content_html', article['body'])
         # publish_date is in milliseconds; delorean.epoch expects seconds.
         il.add_value('updated',
                      delorean.epoch(article['publish_date'] / 1000))
         il.add_value('author_name', [
             contribution['contributor']['full_name']
             for contribution in article['contributions']
         ])
         il.add_value('category', article['channel']['name'])
         # Topics plus the primary topic become (title-cased) categories.
         for topic in article['topics'] + [article['primary_topic']]:
             if topic and 'name' in topic:
                 il.add_value('category', topic['name'].title())
         if article['nsfw']:
             il.add_value('category', 'nsfw')
         if article['nsfb']:
             il.add_value('category', 'nsfb')
         il.add_value('path', response.meta['locale'])
         yield il.load_item()
Ejemplo n.º 8
0
 def parse_movies(self, response):
     """Parse a JSON movie-event search result into feed entries."""
     entries = json.loads(response.text)["hits"]
     for entry in entries:
         il = FeedEntryItemLoader(response=response,
                                  base_url="https://{}".format(self.name))
         il.add_value("path", "{}".format(response.meta["movies"]))
         il.add_value(
             "link",
             "https://www.{}/kino/{}".format(self.name, entry["prod_id"]))
         il.add_value("title", entry["prod"])
         il.add_value("content_html", entry["comment"])
         # "or []" guards against "images" being None/empty.
         for image in entry["images"] or []:
             il.add_value(
                 "content_html",
                 '<img src="https://faltercdn2.falter.at/events/1080/{}">'.
                 format(image["filename"]),
             )
         if "stream" in entry:
             il.add_value("content_html",
                          '<a href="{s}">{s}</a>'.format(s=entry["stream"]))
         # Truthy has_*/is_* flags become categories (prefix stripped).
         for key, value in entry.items():
             if key.startswith("has_") and value:
                 il.add_value("category", key.replace("has_", ""))
             elif key.startswith("is_") and value:
                 il.add_value("category", key.replace("is_", ""))
         il.add_value("updated", entry["index_date"])
         yield il.load_item()
Ejemplo n.º 9
0
 def parse_broadcast(self, response):
     """Parse an oe1 broadcast JSON document into a feed entry."""
     broadcast = json.loads(response.text)
     il = FeedEntryItemLoader(response=response,
                              timezone=self._timezone,
                              dayfirst=False)
     link = 'https://{}/programm/{}/{}'.format(self.name,
                                               response.meta['oe1_day'],
                                               broadcast['programKey'])
     il.add_value('link', link)
     # Program title first, then the episode title.
     il.add_value('title', broadcast['programTitle'])
     il.add_value('title', broadcast['title'])
     # Attach the audio stream as an enclosure when a loopStreamId exists.
     if broadcast.get('streams'):
         stream = 'http://loopstream01.apa.at/?channel=oe1&id={}'.format(
             broadcast['streams'][0]['loopStreamId'])
         il.add_value('enclosure_iri', stream)
         il.add_value('enclosure_type', 'audio/mpeg')
     il.add_value('updated', broadcast['niceTimeISO'])
     if broadcast['subtitle']:
         il.add_value('content_html',
                      '<strong>{}</strong>'.format(broadcast['subtitle']))
     # Each broadcast item contributes an optional heading and description.
     for item in broadcast['items']:
         if 'title' in item:
             il.add_value('content_html',
                          '<h3>{}</h3>'.format(item['title']))
         il.add_value('content_html', item.get('description'))
     il.add_value('content_html', broadcast['description'])
     yield il.load_item()
Ejemplo n.º 10
0
 def _parse_article(self, response):
     """Parse one page of an article, following pagination recursively.

     The partially filled loader is passed along via response.meta["il"];
     the item is only loaded on the last page.
     """
     remove_elems = [
         ".caption-credit",
         ".gallery-image-credit",
         "#social-left",
         "ul.toc",
         "h3:contains('Table of Contents')",
         "br",
         ".sidebar:contains('Further Reading')",
         ".credit",
     ]
     change_tags = {".sidebar": "blockquote", "aside": "blockquote"}
     replace_elems = {"div.image": self._div_to_img}
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         replace_elems=replace_elems,
         change_tags=change_tags,
     )
     # Link, author, and lead only appear on the first page.
     if response.meta.get("first_page", False):
         il.add_value("link", response.url)
         il.add_css("author_name", ".byline a span ::text")
         il.add_css("content_html", "header h2")
         il.add_value("path", response.meta["path"])
     il.add_css("content_html", ".article-content")
     # Follow the last pagination link (the next page) if one exists.
     if response.css(".next"):
         return scrapy.Request(
             response.css(".numbers a::attr(href)").extract()[-1],
             self._parse_article,
             meta={"il": il, "path": response.meta["path"]},
         )
     else:
         return il.load_item()
Ejemplo n.º 11
0
 def parse_node(self, response, node):
     """Parse one RSS item node, handling paywalled and weekly-edition items.

     Paywalled items are emitted as teaser-only entries unless the spider
     is subscribed; everything else triggers a fetch of the article page.
     """
     il = FeedEntryItemLoader(response=response,
                              base_url=f"https://{self.name}")
     updated = dateutil_parse(node.xpath("dc:date/text()").extract_first())
     il.add_value("updated", updated)
     # NOTE(review): assumes every item has a title; extract_first() would
     # return None otherwise and startswith() would raise — confirm.
     title = node.xpath("rss:title/text()").extract_first()
     paywalled = title.startswith("[$]")
     if paywalled:
         title = title.replace("[$] ", "")
         il.add_value("category", "paywalled")
     link = node.xpath("rss:link/text()").extract_first()
     # Strip the "rss" marker from the URL and force HTTPS.
     link = link.replace("rss", "")
     link = link.replace("http://", "https://")
     meta = {"il": il}
     if paywalled and not self._subscribed:
         # Not subscribed: emit the teaser (description) only.
         il.add_value("title", title)
         il.add_value("author_name",
                      node.xpath("dc:creator/text()").extract_first())
         il.add_value("content_text",
                      node.xpath("rss:description/text()").extract_first())
         il.add_value("link", link)
         return il.load_item()
     else:
         if "LWN.net Weekly Edition for" in title:
             meta["updated"] = updated
             callback = self._parse_weekly_edition
             link += "bigpage"
         else:
             callback = self._parse_article
         # Don't include link yet, we will use the subscriber link later.
         # So subscriber articles can be shared from the feed reader and
         # read in browser without logging in.
         return scrapy.Request(link, callback, meta=meta)
Ejemplo n.º 12
0
    def parse_album(self, response):
        """Parse an album product page into a feed entry."""
        def _replace_track_info(elem):
            # Collapse the element to "<first child> (<second child>)" —
            # presumably track title and duration; confirm against markup.
            parts = list(
                map(lambda x: x.text_content().strip(), elem.getchildren()))
            return '<p>{} <i>({})</i></p>'.format(parts[0], parts[1])

        # Last text node of the product title heading.
        title = response.xpath('//h1[@class="c-product-block__title"]//text()'
                               ).extract()[-1].strip()
        # First contributor (up to the first comma).
        artist = response.xpath(
            '//div[contains(@class,"c-product-block__contributors")]/p/text()'
        ).re_first('[^,]+')
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}/".format(self.name),
            remove_elems=[
                '.c-product-block__title', '.c-product__product-purchase',
                '.c-track__format-specific-info', '.c-track__duration',
                '.c-track__details', '.c-tracklist__initial-tracks',
                '.c-tabs-block__tabs-links', 'button'
            ],
            replace_elems={'.c-track__all-format-info': _replace_track_info})
        il.add_value("title", '{} - {}'.format(artist, title))
        il.add_value("link", response.url)
        il.add_value("author_name", 'bot')
        il.add_css("content_html", 'div.c-page--product')
        return il.load_item()
Ejemplo n.º 13
0
    def _parse_article_url(self, response):
        """Parse an article overview page and follow its print version.

        Skips error pages. When no "Druckversion" link exists, the article
        is paywalled and only the freely available content is emitted.
        """
        # extract_first() returns None when the page has no <h2>; guard with
        # "or ''" so the membership test cannot raise TypeError.
        if 'Fehler' in (response.css('h2 ::text').extract_first() or ''):
            self.logger.info('Skipping {} as it returned an error'.format(
                response.url))
            return

        remove_elems = ['div[style="padding-top:10px;"]']
        il = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 base_url='http://{}'.format(self.name),
                                 dayfirst=True,
                                 remove_elems=remove_elems)
        il.add_value('link', response.url)
        il.add_value('author_name', 'VKI')
        # Raw string: \s and \. are regex escapes, not string escapes.
        date = response.css('.issue').re_first(
            r'veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})')
        il.add_value('updated', date)
        # The print-version URL is hidden in the link's onclick handler.
        url = (response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
            r"window\.open\('(.*)'\);"))
        il.add_css('title', 'h1::text')
        if url:
            yield scrapy.Request(response.urljoin(url),
                                 callback=self._parse_article,
                                 meta={'il': il})
        else:
            il.add_value('category', 'paywalled')
            il.add_css('content_html', '.primary')
            il.add_css('content_html', 'div[style="padding-top:10px;"] > h3')
            yield il.load_item()
Ejemplo n.º 14
0
 def parse(self, response):
     """Parse a feed document and emit its header plus one entry per item.

     Entries are emitted directly when the feed carries full text;
     otherwise the article pages are fetched and parsed separately.
     """
     feed = feedparser.parse(io.BytesIO(response.body))
     if "entries" not in feed or not feed["entries"]:
         self.logger.error("Feed {} contains no entries!".format(response.url))
         return
     feed_entries = feed["entries"]
     feed = feed["feed"]
     yield generate_feed_header(
         title=feed.get("title"),
         subtitle=feed.get("subtitle"),
         link=feed.get("link") or response.url,
         path=response.meta["path"],
         author_name=feed.get("author_detail", {}).get("name"),
         logo=feed.get("image", {}).get("href"),
     )
     # scheme://netloc of the feed URL, used to resolve relative links.
     base_url = "://".join(urlparse(response.url)[:2])
     for entry in feed_entries:
         # Deals with protocol-relative URLs.
         link = urljoin(base_url, entry["link"])
         il = FeedEntryItemLoader(base_url=base_url)
         il.add_value("path", response.meta["path"])
         il.add_value("updated", entry.get("updated") or entry.get("published"))
         il.add_value("author_name", entry.get("author_detail", {}).get("name"))
         il.add_value("link", link)
         il.add_value("category", [t["term"] for t in entry.get("tags", [])])
         if response.meta["fulltext"]:
             il.add_value("title", entry["title"])
             il.add_value("content_html", entry["content"][0]["value"])
             yield il.load_item()
         else:
             # Content is not part of the feed, scrape it.
             yield scrapy.Request(
                 link, self._parse_article, meta={"feed_entry": entry, "il": il}
             )
Ejemplo n.º 15
0
 def _parse_episode(self, response):
     """Parse an episode page into a feed entry.

     Title and broadcast date are both embedded in the "title" meta tag
     ("<show> vom <dd.mm.yyyy> - puls4.com").
     """
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "title",
         '//meta[@name="title"]/@content',
         re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
     )
     # Combine the date from the meta title with the (optional) start time.
     il.add_value(
         "updated",
         "{} {}".format(
             response.xpath('//meta[@name="title"]/@content').re_first(
                 r".*vom (\d{2}\.\d{2}\.\d{4}).*"
             ),
             response.meta["time"] or "00:00",
         ),
     )
     # Use the og:image preview as the entry's leading image.
     il.add_value(
         "content_html",
         '<img src="{}">'.format(
             response.xpath('//meta[@property="og:image"]/@content').extract_first()
         ),
     )
     il.add_css("content_html", ".player-video-description-intro::text")
     return il.load_item()
Ejemplo n.º 16
0
 def parse_release_changelog(self, response):
     """Append the detailed changelog section to the parent feed entry."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url=self._base_url,
     )
     loader.add_value("content_html", "<h1>Detailed Changelog</h1>")
     loader.add_xpath("content_html", "//h1/following-sibling::*")
     return loader.load_item()
Ejemplo n.º 17
0
    def _parse_article(self, response):
        """Parse an article page into a feed entry.

        Raises DropResponse when the page has no og:title, which indicates
        the request ran into bot detection.
        """
        title = response.css('meta[property="og:title"]::attr(content)').extract_first()
        if not title:
            raise DropResponse(
                "Skipping {} because ran into bot detection".format(response.url),
                transient=True,
            )

        remove_elems = [
            "meta",
            ".ds-share-list",
            ".advert",
            ".layout-article-links",
            ".ds-chapter-list",
            ".layout-article-meta",
        ]
        # Normalize site-specific wrappers into semantic HTML tags.
        change_tags = {
            ".article__lead-image": "figure",
            ".article__description": "h2",
            ".article__footnote": "i",
        }
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
        )
        il.add_value("link", response.url)
        il.add_value("title", title)
        il.add_css("updated", "time.article__dateline-datetime::attr('datetime')")
        il.add_css("content_html", ".article__lead-image")
        il.add_css("content_html", ".article__description")
        il.add_css("content_html", ".layout-article-body")
        il.add_value("path", response.meta["ressort"])
        return il.load_item()
Ejemplo n.º 18
0
    def _parse_article(self, response):
        """Parse an article page into a feed entry."""
        def _fix_img_src(elem):
            # Lazily loaded images keep the real URL in data-original.
            if "data-original" in elem.attrib:
                elem.attrib["src"] = elem.attrib["data-original"]
            return elem

        # Credits, caption toggles, ads, and newsletter boilerplate.
        remove_elems = [
            ".credit",
            ".hide-caption",
            ".toggle-caption",
            ".enlarge-options",
            ".enlarge_measure",
            ".enlarge_html",
            ".ad-backstage",
            'p:first-of-type:contains("Editor\'s Note: This is an excerpt of")',
            'p:contains("Did you enjoy this newsletter segment?")',
        ]
        replace_elems = {"img": _fix_img_src}
        change_tags = {".image": "figure", ".credit-caption": "figcaption"}

        il = FeedEntryItemLoader(
            response=response,
            base_url=self._base_url,
            remove_elems=remove_elems,
            replace_elems=replace_elems,
            change_tags=change_tags,
        )
        il.add_css("title", "h1 ::text")
        il.add_value("link", response.url)
        il.add_css("content_html", "#storytext")
        il.add_value("path", response.meta["path"])
        il.add_css("updated", '.dateblock time::attr("datetime")')
        il.add_css("author_name", ".byline__name a::text")

        yield il.load_item()
Ejemplo n.º 19
0
    def _parse_article(self, response):
        """Parse the print view of an article into a feed entry.

        Returns nothing for deleted articles (HTTP 410).
        """
        if response.status == 410:
            # Article has been deleted.
            return

        remove_elems = [".bildtext .author", "iframe"]
        change_tags = {"h1": "h2", ".bildbox": "figure", ".bildtext": "figcaption"}
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            base_url="https://www.{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
            dayfirst=True,
            yearfirst=False,
        )
        if response.css(".payment"):
            il.add_value("category", "paywalled")
        il.add_css("link", 'link[rel="canonical"]::attr(href)')
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        # Author is the middle part of the "· <author> ·" print headline.
        il.add_css("author_name", ".druckheadline::text", re=r"·\s*(.*)\s*·")
        # Mon, 01 Oct 18 13:42:45 +0200
        il.add_css("updated", 'meta[http-equiv="last-modified"]::attr(content)')
        il.add_css("content_html", ".druckcontent")
        il.add_value("path", response.meta["ressort"])
        return il.load_item()
Ejemplo n.º 20
0
 def _parse_episode(self, response):
     """Parse an episode page into a feed entry.

     Title and broadcast date are both embedded in the "title" meta tag
     ("<show> vom <dd.mm.yyyy> - puls4.com").
     """
     il = FeedEntryItemLoader(
         response=response,
         base_url=f"https://{self.name}",
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "title",
         '//meta[@name="title"]/@content',
         re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
     )
     # Combine the date from the meta title with the (optional) start time.
     il.add_value(
         "updated",
         "{} {}".format(
             response.xpath('//meta[@name="title"]/@content').re_first(
                 r".*vom (\d{2}\.\d{2}\.\d{4}).*"),
             response.meta["time"] or "00:00",
         ),
     )
     # Use the og:image preview as the entry's leading image.
     il.add_value(
         "content_html",
         '<img src="{}">'.format(
             response.xpath(
                 '//meta[@property="og:image"]/@content').extract_first()),
     )
     il.add_css("content_html", ".player-video-description-intro::text")
     return il.load_item()
Ejemplo n.º 21
0
 def parse_item(self, response):
     """Parse an article page into a feed entry."""
     # Navigation/boilerplate elements excluded from the entry content.
     remove_elems = [
         'aside',
         'script',
         'h1',
         '.breadcrumbs',
         '.author-date',
         '.artikel-social-kommentar',
         '.bild-copyright',
         '.ressortTitleMobile',
         '.article-number',
         '.artikel-kommentarlink',
         '.umfrage-wrapper',
         '.articleIssueInfo',
     ]
     il = FeedEntryItemLoader(response=response,
                              timezone=self._timezone,
                              base_url='http://{}'.format(self.name),
                              remove_elems=remove_elems)
     il.add_value('link', response.url)
     # Byline match "Von <first> <last>", falling back to "Red.".
     author_name = (
         response.css('.author-date ::text').re(r'(?:Von)?\s*(\w+ \w+)')
         or 'Red.')
     il.add_value('author_name', author_name)
     il.add_css('title', 'h1[itemprop="headline"]::text')
     # Keep everything before the "+" (drops the timezone offset).
     il.add_css('updated',
                'meta[property="article:published_time"]::attr(content)',
                re='([^+]*)')
     il.add_css('content_html', 'article')
     yield il.load_item()
Ejemplo n.º 22
0
    def _parse_article(self, response):
        """Parse the print view of an article into a feed entry.

        Returns nothing for deleted articles (HTTP 410).
        """
        if response.status == 410:
            # Article has been deleted.
            return

        remove_elems = [
            '.bildtext .author', 'iframe',
        ]
        change_tags = {
            'h1': 'h2'
        }
        il = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 base_url='https://www.{}'.format(self.name),
                                 remove_elems=remove_elems,
                                 change_tags=change_tags,
                                 dayfirst=False,
                                 yearfirst=False)
        if response.css('.payment'):
            il.add_value('category', 'paywalled')
        il.add_css('link', 'link[rel="canonical"]::attr(href)')
        il.add_css('title', 'meta[property="og:title"]::attr(content)')
        # Raw string so \s is a regex escape, not an invalid string escape.
        # Author is the middle part of the "· <author> ·" print headline.
        il.add_css('author_name', '.druckheadline::text',
                   re=r'·\s*(.*)\s*·')
        il.add_css('updated',
                   'meta[http-equiv="last-modified"]::attr(content)')
        il.add_css('content_html', '.druckcontent')
        il.add_value('path', response.meta['ressort'])
        yield il.load_item()
Ejemplo n.º 23
0
 def _parse_article(self, response):
     """Fill the parent feed entry's content from #page, minus site chrome."""
     chrome = ['#issue', 'h1', '#slogan', '#logo', '#footer']
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta['il'],
         base_url='http://{}'.format(self.name),
         remove_elems=chrome,
     )
     loader.add_css('content_html', '#page')
     yield loader.load_item()
Ejemplo n.º 24
0
 def _parse_article(self, response):
     """Fill the parent feed entry's content from the article body."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta['il'],
         remove_elems=['iframe', 'script'],
         base_url='http://{}'.format(self.name),
     )
     loader.add_css('content_html', '.entry-content')
     return loader.load_item()
Ejemplo n.º 25
0
 def parse(self, response):
     """Parse a page's posts JSON into a feed header plus feed entries.

     A title is derived from the post message's first line, the link
     name, or the link itself — in that order.
     """
     page = json.loads(response.text)
     yield generate_feed_header(title=page["name"],
                                link=page["link"],
                                path=response.meta["page_id"])
     for entry in page["posts"]["data"]:
         il = FeedEntryItemLoader()
         # updated_time also includes new comments not only updates to the
         # post.
         il.add_value("updated", entry["created_time"])
         # Post ids look like "<user_id>_<post_id>".
         il.add_value(
             "link",
             "https://www.{name}/{user_id}/posts/{post_id}".format(
                 name=self.name,
                 **dict(zip(["user_id", "post_id"],
                            entry["id"].split("_")))),
         )
         message = entry.get("message")
         name = entry.get("name")
         link = entry.get("link")
         if message:
             message = message.splitlines()
             title = message[0]
             if len(title.split()) < 10 and not title.startswith("http"):
                 # If the first line has less than ten words, it could be a
                 # title.
                 if title.upper() == title:
                     title = title.title()
                 del message[0]
             elif name and not name.startswith("http"):
                 # Fallback to the name (of the link).
                 title = name
             else:
                 # Fallback to the first ten words of the message.
                 # (split(maxsplit=10) keeps the whole remainder in its
                 # last element, which would yield the entire line.)
                 title = " ".join(message[0].split()[:10]) + " ..."
             message = bleach.linkify("</p><p>".join(message))
             il.add_value("content_html", "<p>{}</p>".format(message))
         elif name:
             title = name
         else:
             title = link
         il.add_value("title", title)
         if link and name:
             il.add_value(
                 "content_html",
                 '<p><a href="{link}">{name}</a></p>'.format(link=link,
                                                             name=name),
             )
         picture = entry.get("picture")
         if picture:
             il.add_value(
                 "content_html",
                 '<a href="{link}"><img src="{image}"></a>'.format(
                     link=link, image=picture),
             )
         il.add_value("path", response.meta["page_id"])
         yield il.load_item()
Ejemplo n.º 26
0
 def parse_release_changelog(self, response):
     """Attach the detailed changelog to the parent feed entry."""
     loader = FeedEntryItemLoader(
         response=response, parent=response.meta['il'], base_url=self._base_url
     )
     loader.add_value('content_html', '<h1>Detailed Changelog</h1>')
     loader.add_xpath('content_html', '//h1/following-sibling::*')
     yield loader.load_item()
Ejemplo n.º 27
0
 def parse(self, response):
     """Parse a page's posts JSON into a feed header plus feed entries.

     A title is derived from the post message's first line, the link
     name, or the link itself — in that order.
     """
     page = json.loads(response.text)
     yield generate_feed_header(
         title=page["name"], link=page["link"], path=response.meta["page_id"]
     )
     for entry in page["posts"]["data"]:
         il = FeedEntryItemLoader()
         # updated_time also includes new comments not only updates to the
         # post.
         il.add_value("updated", entry["created_time"])
         # Post ids look like "<user_id>_<post_id>".
         il.add_value(
             "link",
             "https://www.{name}/{user_id}/posts/{post_id}".format(
                 name=self.name,
                 **dict(zip(["user_id", "post_id"], entry["id"].split("_")))
             ),
         )
         message = entry.get("message")
         name = entry.get("name")
         link = entry.get("link")
         if message:
             message = message.splitlines()
             title = message[0]
             if len(title.split()) < 10 and not title.startswith("http"):
                 # If the first line has less than ten words, it could be a
                 # title.
                 if title.upper() == title:
                     title = title.title()
                 del message[0]
             elif name and not name.startswith("http"):
                 # Fallback to the name (of the link).
                 title = name
             else:
                 # Fallback to the first ten words of the message.
                 # (split(maxsplit=10) keeps the whole remainder in its
                 # last element, which would yield the entire line.)
                 title = " ".join(message[0].split()[:10]) + " ..."
             message = bleach.linkify("</p><p>".join(message))
             il.add_value("content_html", "<p>{}</p>".format(message))
         elif name:
             title = name
         else:
             title = link
         il.add_value("title", title)
         if link and name:
             il.add_value(
                 "content_html",
                 '<p><a href="{link}">{name}</a></p>'.format(link=link, name=name),
             )
         picture = entry.get("picture")
         if picture:
             il.add_value(
                 "content_html",
                 '<a href="{link}"><img src="{image}"></a>'.format(
                     link=link, image=picture
                 ),
             )
         il.add_value("path", response.meta["page_id"])
         yield il.load_item()
Ejemplo n.º 28
0
 def _parse_news(self, response):
     """Extract the news body matching the id passed in via response.meta."""
     loader = FeedEntryItemLoader(response=response, parent=response.meta["il"])
     # Select the newsinner div that directly follows the matching header.
     selector = (
         '//div[@class="newsheader" and .//a[@id="{}"]]'
         '/following-sibling::div[@class="newsinner"]'
     ).format(response.meta["news_id"])
     loader.add_xpath("content_html", selector)
     return loader.load_item()
Ejemplo n.º 29
0
 def _parse_article(self, response):
     """Fill the parent feed entry's content from #page, minus site chrome."""
     chrome = ["#issue", "h1", "#slogan", "#logo", "#footer"]
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url="https://{}".format(self.name),
         remove_elems=chrome,
     )
     loader.add_css("content_html", "#page")
     return loader.load_item()
Ejemplo n.º 30
0
 def parse_item(self, response):
     """Build a feed entry from the page's main <div>.

     The update date is the part before "_" in the last URL path segment.
     """
     main = response.xpath('//div[@class="main"]')
     loader = FeedEntryItemLoader(selector=main, timezone="Europe/Vienna")
     loader.add_xpath("title", "h1/text()")
     loader.add_value("link", response.url)
     loader.add_xpath("content_html", "h1/following-sibling::*")
     last_segment = response.url.rstrip("/").split("/")[-1]
     loader.add_value("updated", last_segment.split("_")[0])
     loader.add_value("author_name", self.name)
     return loader.load_item()
Ejemplo n.º 31
0
 def _parse_article(self, response):
     """Load the article content into the parent item, stripping chrome."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url=f"https://{self.name}",
         remove_elems=["#issue", "h1", "#slogan", "#logo", "#footer"],
     )
     loader.add_css("content_html", "#page")
     return loader.load_item()
Ejemplo n.º 32
0
 def parse_item(self, response):
     """Append the article's main content block to the parent item."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta['il'],
         remove_elems=['h1', '.delayed-image-load'],
         # <noscript> wraps the delayed images; keep its content as a <div>.
         change_tags={'noscript': 'div'},
         base_url='http://{}'.format(self.name),
     )
     loader.add_xpath('content_html', '//div[@id="main-inner"]')
     yield loader.load_item()
Ejemplo n.º 33
0
 def parse_letter(self, response):
     """Parse a single newsletter page into a feed entry."""
     account = response.meta["account"]
     loader = FeedEntryItemLoader(
         response=response, base_url=self._links.get(account)
     )
     loader.add_value("path", account)
     loader.add_value("link", response.url)
     css_fields = [
         ("title", "title::text"),
         ("author_name", "div#message-heading div.by-line a::text"),
         ("updated", "div#message-heading div.date::text"),
         ("content_html", "div.message-body"),
     ]
     for field, selector in css_fields:
         loader.add_css(field, selector)
     yield loader.load_item()
Ejemplo n.º 34
0
 def parse_item(self, response):
     """Create a feed entry from the main content area of the page."""
     main = response.xpath('//div[@class="main"]')
     il = FeedEntryItemLoader(selector=main, timezone="Europe/Vienna")
     il.add_xpath("title", "h1/text()")
     il.add_value("link", response.url)
     il.add_xpath("content_html", "h1/following-sibling::*")
     # Date prefix of the final URL segment, e.g. ".../20200101_foo".
     last_segment = response.url.rstrip("/").split("/")[-1]
     il.add_value("updated", last_segment.split("_")[0])
     il.add_value("author_name", self.name)
     return il.load_item()
Ejemplo n.º 35
0
    def _parse_article(self, response):
        """Parse an article page into a feed entry.

        Derives categories from the breadcrumb navigation, repairs lazily
        loaded images, and distinguishes blog posts from normal articles
        when extracting the author name.
        """

        def _fix_img_src(elem):
            # Lazily loaded images keep their real URL in a data attribute;
            # copy it into "src" so feed readers can display them.
            if "src" not in elem.attrib:
                if "data-lazy-src" in elem.attrib:
                    elem.attrib["src"] = elem.attrib["data-lazy-src"]
                elif "data-src" in elem.attrib:
                    elem.attrib["src"] = elem.attrib["data-src"]
            return elem

        def _parse_breadcrumbs(breadcrumbs):
            # Extracted values alternate pairwise; k[1:] drops the first
            # character of the key (presumably a leading "/" of the href) —
            # NOTE(review): confirm the text/href pairing order.
            links = breadcrumbs.css("a::text, a::attr('href')").extract()
            return {k[1:]: v for k, v in zip(links[::2], links[1::2])}

        breadcrumbs = _parse_breadcrumbs(
            response.css(".site-contextnavigation-breadcrumbs-nav a")
        )
        # Accumulate breadcrumb titles across parsed articles.
        self._titles = {**self._titles, **breadcrumbs}

        remove_elems = [
            "ad-container",
            "figure > footer",
            "picture > button",
            "div[data-section-type='newsletter']",
            ".gallery-summary",
        ]
        change_tags = {
            ".article-subtitle": "strong",
            "aside": "blockquote",
            "p strong:only-child": "h3",
        }
        replace_elems = {"img": _fix_img_src}
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
            replace_elems=replace_elems,
            timezone="Europe/Vienna",
        )
        il.add_value("link", response.url)
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        if response.css(".article-origins .article-author-avatar"):
            # Blog posts.
            il.add_css("author_name", ".article-author-avatar > span ::text")
        else:
            # Normal articles.
            il.add_css("author_name", ".article-origins ::text")
        il.add_value("path", response.meta["ressort"])
        il.add_value("category", breadcrumbs.values())
        il.add_css("category", ".storylabels span ::text")
        il.add_css("updated", "time::attr('datetime')")
        il.add_css("content_html", ".article-subtitle")
        il.add_css("content_html", ".article-body")
        return il.load_item()
Ejemplo n.º 36
0
    def parse_item(self, response):
        """Parse an article unless it is gone or outside the wanted sections."""
        if response.status == 404:
            self.logger.info("Article '{}' not available anymore.".format(
                response.url))
            return

        def _clean_caption(elem):
            # "–" is an en dash separating caption text from the photo credit.
            if "–" in elem.text:
                # Caption is of the format "text - credit".
                elem.text = re.sub(r"\s*([^–]*).*", r"\1", elem.text)
                return elem
            else:
                # It's just the "credit", remove it.
                return None

        section = response.css('meta[name="kt:section-path"]::attr("content")'
                               ).extract_first()[1:]  # Skip the first /.
        if section not in self._sections and "all" not in self._sections:
            # Ignore the response as the ressort should not be parsed.
            return

        il = FeedEntryItemLoader(
            response=response,
            parent=response.meta["il"],
            remove_elems=[
                ".ad",
                ".article-paid",
                ".js-overlay-close",
                ".swiper-lazy-preloader",
            ],
            change_tags={".article__lead": "strong"},
            # Zoomed images keep the real URL in "data-src"; rename to "src".
            change_attribs={".zoomable__image--zoomed": {
                "data-src": "src"
            }},
            replace_elems={".article__media-caption": _clean_caption},
            base_url="https://www.{}".format(self.name),
        )
        # Strip an optional "Von"/"von" prefix from the author line.
        il.add_css(
            "author_name",
            "article .article__author ::text",
            re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
        )
        il.add_css("content_html", "article .article__media .zoomable__inner")
        il.add_css("content_html",
                   "article .article__lead")  # change tags to strong
        il.add_css("content_html", "article .article__body")
        if response.css(".article-paid"):
            il.add_value("category", "paywalled")
        il.add_value("category", section.split("/"))
        if "all" in self._sections:
            il.add_value("path", "all")
        if section in self._sections:
            il.add_value("path", section)
        return il.load_item()
Ejemplo n.º 37
0
 def _parse_restaurant(self, response):
     """Add the restaurant description, link and tags to the parent item."""
     loader = FeedEntryItemLoader(
         response=response,
         base_url=response.url,
         parent=response.meta["il"],
         remove_elems=[".external"],
     )
     # Description paragraphs first, then the link block (order matters
     # for the assembled content).
     for selector in (".content .right p", ".restaurant-link"):
         loader.add_css("content_html", selector)
     loader.add_css("category", ".tags a ::text")
     yield loader.load_item()
Ejemplo n.º 38
0
 def _parse_restaurant(self, response):
     """Merge restaurant details (description, link, tags) into the entry."""
     item_loader = FeedEntryItemLoader(
         response=response,
         base_url=response.url,
         parent=response.meta["il"],
         remove_elems=[".external"],
     )
     item_loader.add_css("category", ".tags a ::text")
     item_loader.add_css("content_html", ".content .right p")
     item_loader.add_css("content_html", ".restaurant-link")
     yield item_loader.load_item()
Ejemplo n.º 39
0
 def parse_item_text(self, response):
     """Extract the article text; tag subscriber-only articles."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=[".ad-component", ".wp-caption-text"],
         base_url="https://{}".format(self.name),
     )
     # A ".bluebox" element marks paywalled content.
     if response.css(".bluebox"):
         loader.add_value("category", "paywalled")
     loader.add_css("content_html", "div.pR")
     return loader.load_item()
Ejemplo n.º 40
0
 def parse_item(self, response):
     """Fill the parent item with the article's main content block."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         # <noscript> wraps the delayed images; keep its content as a <div>.
         change_tags={"noscript": "div"},
         remove_elems=["h1", ".delayed-image-load"],
         base_url="https://www.{}".format(self.name),
     )
     loader.add_xpath("content_html", '//div[@id="main-inner"]')
     return loader.load_item()
Ejemplo n.º 41
0
    def parse_item(self, response):
        """Parse an article unless it is gone or outside the wanted sections."""
        if response.status == 404:
            self.logger.info("Article '{}' not available anymore.".format(response.url))
            return

        def _clean_caption(elem):
            # "–" is an en dash separating caption text from the photo credit.
            if "–" in elem.text:
                # Caption is of the format "text - credit".
                elem.text = re.sub(r"\s*([^–]*).*", r"\1", elem.text)
                return elem
            else:
                # It's just the "credit", remove it.
                return None

        section = response.css(
            'meta[name="kt:section-path"]::attr("content")'
        ).extract_first()[
            1:
        ]  # Skip the first /.
        if section not in self._sections and "all" not in self._sections:
            # Ignore the response as the ressort should not be parsed.
            return

        il = FeedEntryItemLoader(
            response=response,
            parent=response.meta["il"],
            remove_elems=[
                ".ad",
                ".article-paid",
                ".js-overlay-close",
                ".swiper-lazy-preloader",
            ],
            change_tags={".article__lead": "strong"},
            # Zoomed images keep the real URL in "data-src"; rename to "src".
            change_attribs={".zoomable__image--zoomed": {"data-src": "src"}},
            replace_elems={".article__media-caption": _clean_caption},
            base_url="https://www.{}".format(self.name),
        )
        # Strip an optional "Von"/"von" prefix from the author line.
        il.add_css(
            "author_name",
            "article .article__author ::text",
            re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
        )
        il.add_css("content_html", "article .article__media .zoomable__inner")
        il.add_css("content_html", "article .article__lead")  # change tags to strong
        il.add_css("content_html", "article .article__body")
        if response.css(".article-paid"):
            il.add_value("category", "paywalled")
        il.add_value("category", section.split("/"))
        if "all" in self._sections:
            il.add_value("path", "all")
        if section in self._sections:
            il.add_value("path", section)
        return il.load_item()
Ejemplo n.º 42
0
 def parse_item(self, response):
     """Parse an event page into a feed entry."""
     cleanup = [".ruler", "h1"]
     loader = FeedEntryItemLoader(
         response=response,
         base_url="{}/".format(self.feed_link),
         timezone="Europe/Vienna",
         dayfirst=True,
         remove_elems=cleanup,
     )
     loader.add_value("link", response.url)
     loader.add_css("title", "h1.event-title::text")
     loader.add_css("content_html", "div#content.container")
     return loader.load_item()
Ejemplo n.º 43
0
 def parse_item_text(self, response):
     """Extract the article body and flag paywalled articles.

     Teaser-only articles contain the hint "Lesen Sie diesen Artikel in
     voller Länge"; those are tagged "paywalled".
     """
     remove_elems = [".dachzeile", "h1", ".meta", "br", "form", ".button-container"]
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         base_url="https://{}".format(self.name),
     )
     content = response.xpath("//article").extract_first()
     # extract_first() returns None when no <article> element is present;
     # guard the substring check to avoid a TypeError in that case.
     if content and "Lesen Sie diesen Artikel in voller Länge" in content:
         il.add_value("category", "paywalled")
     il.add_value("content_html", content)
     return il.load_item()
Ejemplo n.º 44
0
    def parse_item(self, response):
        """Build a feed entry for a comic or blog post."""
        loader = FeedEntryItemLoader(response=response, base_url=self._base_url)
        loader.add_value("link", response.url)
        loader.add_value("updated", response.meta["updated"])
        loader.add_value("author_name", response.meta["author_name"])
        loader.add_css("title", "title::text", re="(.*) - The Oatmeal")
        # The first URL path segment doubles as the category (e.g. "comics").
        category = urlsplit(response.url).path.strip("/").split("/")[0]
        loader.add_value("category", category)
        # Comic images live under #comic, blog images under #blog; the
        # selector order below determines the content order.
        for selector in ("#comic > img", "#comic > p > img", "#blog .center_text img"):
            loader.add_css("content_html", selector)
        return loader.load_item()
Ejemplo n.º 45
0
    def parse(self, response):
        """Parse a result page: yield one feed item per product thumbnail.

        On the first page also yields the feed header, and follows
        pagination up to ``self._scrape_pages`` pages.
        """
        if len(response.css(".thumbnail")) == 0:
            self.logger.info("No items found.")
            return

        for item in response.css(".thumbnail"):
            il = FeedEntryItemLoader(selector=item, base_url=self._base_url)
            il.add_css("title", ".item_brand_text ::text")
            il.add_css("title", ".item-title ::text")
            il.add_css("title", ".current-price ::text")
            il.add_value(
                "link",
                response.urljoin(item.css(".item-link::attr(href)").extract_first()),
            )
            image_url = item.css(".item-image::attr(data-bg)").re_first(
                r"url\(([^)]+)\)"
            )
            # re_first() returns None when the thumbnail has no background
            # image; skip the <img> tag then instead of raising
            # AttributeError on startswith().
            if image_url:
                # Fix broken images.
                if image_url.startswith("https://markenankauf.momox.de/pics/https://"):
                    image_url = image_url.replace(
                        "https://markenankauf.momox.de/pics/https://", "https://"
                    )
                il.add_value("content_html", '<img src="{}">'.format(image_url))
            il.add_css("content_html", ".item-des-container")
            il.add_value("path", response.meta["path"])
            yield il.load_item()

        page = int(response.css(".pagination .active a::text").extract_first())
        if page == 1:
            yield generate_feed_header(
                title=response.css("title ::text").re_first(
                    "(ubup | .*) Second Hand kaufen"
                ),
                subtitle="Deutschlands größter Second Hand-Onlineshop für "
                "Mode & Accessoires",
                icon="https://www.{}/images/favicon.ico".format(self.name),
                link=response.url,
                path=response.meta["path"],
            )
        if page < self._scrape_pages:
            next_page = response.css(
                ".pagination .active + li a::attr(href)"
            ).extract_first()
            if next_page:
                yield scrapy.Request(
                    response.urljoin(next_page),
                    meta={"dont_cache": True, "path": response.meta["path"]},
                )
Ejemplo n.º 46
0
    def parse_content(self, response):
        """Turn the extracted parts of a library item into a feed entry."""
        parts = self._extract_parts(response)
        loader = FeedEntryItemLoader(
            response=response, timezone="Europe/Vienna", dayfirst=True
        )
        loader.add_value("path", self._library)
        # Everything before the first metadata row makes up the title.
        loader.add_value("title", " - ".join(parts[: self._find_first_meta(parts)]))
        loader.add_value("link", response.url)
        loader.add_xpath(
            "updated", "//td/span/text()", re="In der Bibliothek seit: (.*)"
        )

        items = "".join("<li>{}</li>".format(part) for part in parts)
        loader.add_value("content_html", "<ul>{}</ul>".format(items))
        return loader.load_item()
Ejemplo n.º 47
0
 def _parse_article(self, response):
     """Extract the entry content, inlining footnotes at their references."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=["script"],
         # Info boxes read best as quotes in feed readers.
         change_tags={".entry-content-info-box": "blockquote"},
         base_url="https://{}".format(self.name),
         # Pull footnote contents up next to their reference markers.
         convert_footnotes=[".footnoteContent"],
         pullup_elems={".footnoteContent": 1},
     )
     loader.add_css("content_html", ".entry-content")
     return loader.load_item()
Ejemplo n.º 48
0
 def _parse_article(self, response):
     """Extract title and readable content from a linked article.

     Falls back to the feed entry's own title/summary when the response is
     not text (e.g. a PDF) or when readability extraction fails or yields
     less content than the feed summary.
     """
     feed_entry = response.meta["feed_entry"]
     il = FeedEntryItemLoader(parent=response.meta["il"])
     try:
         response.text
     except AttributeError:
         # Response is not text (e.g. PDF, ...); accessing .text would
         # raise, so use the feed's own metadata instead.
         il.add_value("title", feed_entry.get("title"))
         il.add_value("content_html", feed_entry.get("summary"))
         return il.load_item()
     doc = Document(response.text, url=response.url)
     il.add_value("title", doc.short_title() or feed_entry.get("title"))
     summary = feed_entry.get("summary")
     try:
         content = doc.summary(html_partial=True)
         if summary and len(summary) > len(content):
             # Something probably went wrong if the extracted content is
             # shorter than the summary.
             raise Unparseable
     except Unparseable:
         content = summary
     il.add_value("content_html", content)
     return il.load_item()
Ejemplo n.º 49
0
 def parse_lokalfuehrer(self, response):
     """Yield one feed entry per venue in the JSON search response.

     Venues with a review get the review's title, author and date;
     others fall back to the venue name.
     """
     entries = json.loads(response.text)[0]["hits"]
     for entry in entries:
         il = FeedEntryItemLoader(
             response=response, base_url="https://{}".format(self.name)
         )
         il.add_value(
             "path", "lokalfuehrer_{}".format(response.meta["lokalfuehrer"])
         )
         il.add_value(
             "link", "https://www.{}/lokal/{}".format(self.name, entry["id"])
         )
         il.add_value("category", entry["categories"])
         il.add_value("category", entry["zip"])
         il.add_value("category", entry["city"])
         review = entry.get("review")
         if review:
             il.add_value("title", review["post_title"])
             il.add_value("title", review["post_subtitle"])
             # The meta line is "|"-separated and starts with the author.
             il.add_value("author_name", review["meta"].split("|")[0].title())
             il.add_value("category", "review")
             il.add_value("updated", review["post_date"])
         else:
             il.add_value("title", entry["name"])
         # Lead image first, then review text, category text and address.
         if "pictures" in entry and entry["pictures"]:
             il.add_value(
                 "content_html",
                 '<img src="https://fcc.at/ef/img720/{}">'.format(
                     entry["pictures"][0]["filename"]
                 ),
             )
         if review:
             il.add_value("content_html", review["post_content"])
         il.add_value("content_html", entry["category_text"])
         il.add_value(
             "content_html",
             "<p>{} {}, {}</p>".format(entry["zip"], entry["city"], entry["street"]),
         )
         if entry["location"]:
             il.add_value(
                 "content_html",
                 (
                     '<p><a href="https://www.google.com/maps?q={lat},{lon}">'
                     + "Google Maps</a></p>"
                 ).format(**entry["location"]),
             )
         yield il.load_item()
Ejemplo n.º 50
0
    def _parse_episode(self, response):
        """Build a feed entry for a single episode from the JSON API response.

        Raises:
            DropResponse: if no downloadable video is available yet.
        """
        item = json.loads(response.text)
        il = FeedEntryItemLoader()
        il.add_value("title", item["title"])
        il.add_value(
            "content_html",
            '<img src="{}">'.format(item["playlist"]["preview_image_url"]),
        )
        if item["description"]:
            il.add_value("content_html", item["description"].replace("\r\n", "<br>"))
        il.add_value("updated", item["date"])
        # Link to the public site instead of the API host.
        il.add_value("link", item["url"].replace("api-tvthek.orf.at", "tvthek.orf.at"))
        # Check how many segments are part of this episode.
        if len(item["_embedded"]["segments"]) == 1:
            # If only one segment, item["sources"] contains invalid links.
            # We use the first embedded segment instead.
            # This is also how mediathekviewweb.de works.
            item["sources"] = item["_embedded"]["segments"][0]["sources"]
        try:
            # "Q8C" presumably selects a specific quality level — verify.
            video = next(
                s
                for s in item["sources"]["progressive_download"]
                if s["quality_key"] == "Q8C"
            )
            il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
        except StopIteration:
            self.logger.warning(
                "Could not extract video for '{}'!".format(item["title"])
            )
            raise DropResponse(
                "Skipping {} because not downloadable yet".format(response.url),
                transient=True,
            )

        subtitle = item["_embedded"].get("subtitle")
        if subtitle:
            subtitle = subtitle["_embedded"]["srt_file"]["public_urls"]["reference"]
            il.add_value("enclosure", {"iri": subtitle["url"], "type": "text/plain"})
        else:
            self.logger.debug("No subtitle file found for '{}'".format(item["url"]))
        il.add_value(
            "category",
            self._categories_from_oewa_base_path(
                item["_embedded"]["profile"]["oewa_base_path"]
            ),
        )
        return il.load_item()
Ejemplo n.º 51
0
 def _parse_item(self, response):
     """Parse a page into a feed entry.

     Removes outdated-content notices, tables of contents and empty lists
     before extracting the main content.
     """
     remove_elems = [
         "h1",
         ".nono",
         ".acceptance_org",
         ".state",
         "script",
         ".gentics-portletreload-position-notvisibleposition",
     ]
     remove_elems_xpath = [
         # Advice boxes warning about outdated or archived content.
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     change_tags = {"abbr": "span"}
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Vienna",
         base_url="https://www.{}".format(self.name),
         remove_elems=remove_elems,
         remove_elems_xpath=remove_elems_xpath,
         change_tags=change_tags,
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "author_name",
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
     # The ".state" element carries the last-updated date (dd.mm.yyyy).
     il.add_value(
         "updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")
     )
     il.add_css("content_html", ".Content")
     return il.load_item()
Ejemplo n.º 52
0
    def parse(self, response):
        """Find the latest Mitteilungsblatt and yield one entry per main item.

        Yields Requests whose responses are sent back into this generator
        (inline-requests style): first a HEAD request to resolve the
        redirect to the Mitteilungsblatt, then the JSON API call.
        """
        mitteilungsblaetter = response.css(".mitteilungsblaetter")
        updated = mitteilungsblaetter.css("::text").re_first(r"(\d{2}\.\d{2}\.\d{4})")
        link = response.urljoin(
            mitteilungsblaetter.css('a::attr("href")').extract_first()
        )

        # Resolve the redirect to obtain the Mitteilungsblatt's canonical URL.
        response = yield scrapy.Request(link, method="HEAD")
        mb_url = response.url
        match = re.search(
            r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)", mb_url
        )
        if not match:
            self.logger.error("No Mitteilungsblätter found!")
            return
        else:
            mb_id = match.group(1)

        url = "https://tiss.{}/api/mbl/v22/id/{}".format(self.name, mb_id)
        response = yield scrapy.Request(url)

        # Entries are numbered "main.sub". Iterating in reverse, each
        # sub-entry's content is folded into its main entry before that
        # main entry is emitted.
        last_entry = None
        for entry in reversed(json.loads(response.text)["knoten"]):
            (entry["main"], entry["sub"]) = re.match(
                r"(\d+)\.?(\d*)", entry["counter"]
            ).groups()
            if last_entry is not None and last_entry["main"] == entry["main"]:
                entry["inhalt"] += "<h2>{}</h2>".format(last_entry["titel"])
                entry["inhalt"] += last_entry["inhalt"]
            if entry["sub"] == "":
                il = FeedEntryItemLoader(
                    base_url="https://tiss.{}".format(self.name),
                    timezone="Europe/Vienna",
                    dayfirst=True,
                )
                il.add_value("updated", updated)
                il.add_value("link", mb_url + "#{}".format(entry["counter"]))
                il.add_value("title", entry["titel"])
                il.add_value("content_html", entry["inhalt"])
                yield il.load_item()
                last_entry = None
            else:
                last_entry = entry
Ejemplo n.º 53
0
 def parse_item(self, response):
     """Build a feed entry describing a single book."""
     loader = FeedEntryItemLoader(
         selector=response.xpath('//div[@id="maincontentbook"]'),
         base_url=self.feed_link,
     )
     loader.add_xpath("title", '//h1[@class="p_book_title"]/text()')
     loader.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()')
     loader.add_value("link", response.url)
     loader.add_value("author_name", self.feed_title)
     # Assemble the description from the detail fields, in display order.
     detail_xpaths = [
         '//h1[@class="p_book_title"]/text()',
         '//h2[@class="p_book_author"]/text()',
         '//p[@class="p_book_publisher"]/text()',
         '//p[@class="p_book_isbn"]/text()',
         '(//span[@class="txt10px"])[1]/text()',
         '(//span[@class="txt10px"])[3]/text()',
         '//div[@class="bookcontent"]//text()',
         '//div[@class="p_book_image"]/img',
         '//span[@style="color:red;"]/b/text()',
     ]
     for xpath in detail_xpaths:
         loader.add_xpath("content_html", xpath)
     return loader.load_item()
Ejemplo n.º 54
0
    def parse_item(self, response):
        """Parse a CMS news page into a feed entry."""
        il = FeedEntryItemLoader(
            response=response,
            base_url="{}/cms/".format(self.feed_link),
            timezone="Europe/Vienna",
            remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"],
            # Drop the first <b> and <br> of the item (presumably a
            # duplicated headline — TODO confirm).
            remove_elems_xpath=[
                '//div[@class="news-single-item"]/b[1]',
                '//div[@class="news-single-item"]/br[1]',
            ],
            dayfirst=True,
        )

        # The <title> is of the form "... ::: <actual title>".
        il.add_value(
            "title", response.xpath("//head/title/text()").re_first(r"::: (.*)")
        )

        il.add_value("link", response.url)

        il.add_value(
            "updated",
            response.xpath('//div[@class="news-single-rightbox"]').re_first(
                r"(\d{2}\.\d{2}\.\d{4})"
            ),
        )

        # Author fallbacks, in order: publisher meta, author meta, site name.
        il.add_value(
            "author_name",
            response.xpath('//head/meta[@name="publisher"]/@content').re_first(
                "recht.at, (.*);"
            ),
        )
        il.add_xpath("author_name", '//head/meta[@name="author"]/@content')
        il.add_value("author_name", self.name)

        il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')

        il.add_css("content_html", ".news-single-item h7 font strong")
        il.add_css("content_html", ".news-single-item")

        return il.load_item()
Ejemplo n.º 55
0
 def _parse_interview(self, response):
     """Extract an interview article, dropping its two trailing sections.

     The duplicated selectors in ``remove_elems`` are deliberate:
     presumably removal passes run sequentially, so once the last h2 (and
     its paragraphs) is removed, the previously second-to-last h2 matches
     ":last-of-type" and is removed by the second pass —
     NOTE(review): confirm removal is applied per selector in order.
     """
     remove_elems = [
         ".shareable-quote",
         ".share-bar",
         # Remove the last two h2s and all paragraphs below.
         ".interview-body > h2:last-of-type ~ p",
         ".interview-body > h2:last-of-type",
         ".interview-body > h2:last-of-type ~ p",
         ".interview-body > h2:last-of-type",
     ]
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         remove_elems=remove_elems,
     )
     il.add_value("link", response.url)
     il.add_css("title", "h1::text")
     il.add_css("author_name", "header .user-link__name::text")
     il.add_css("content_html", ".interview-body")
     il.add_value("updated", response.meta["updated"])
     return il.load_item()
Ejemplo n.º 56
0
 def parse_program(self, response):
     """Create a feed entry for a program page with an embedded video.

     Returns None for pages without a video player and for items older
     than ``self._timerange``.
     """
     if not response.css(r".jsb_video\/FlashPlayer"):
         return
     data = json.loads(
         response.css(r".jsb_video\/FlashPlayer").xpath("@data-jsb").extract()[0]
     )
     # The tracking metadata carries the clip URL, program name and air date.
     data = data["config"]["initial_video"]["parts"][0]["tracking"]["nurago"]
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", data["clipurl"])
     il.add_value("title", data["programname"])
     il.add_value("updated", data["airdate"])
     il.add_xpath("content_html", '//p[@class="plot_summary"]')
     item = il.load_item()
     # Only include videos posted in the last 7 days.
     if item["updated"] + self._timerange > datetime.now(timezone.utc):
         return item
Ejemplo n.º 57
0
    def _parse_article_url(self, response):
        """Parse an article page, following the print version when available.

        Without a print version the article is treated as paywalled and
        only the teaser is extracted.

        Raises:
            DropResponse: if the page is empty or shows an error.
        """
        if not response.css("#content"):
            raise DropResponse(
                "Skipping {} since it is empty".format(response.url), transient=True
            )

        if "Fehler" in response.css("h2 ::text").extract_first():
            raise DropResponse(
                "Skipping {} since it returned an error".format(response.url),
                transient=True,
            )

        remove_elems = ['div[style="padding-top:10px;"]']
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            base_url="https://{}".format(self.name),
            dayfirst=True,
            remove_elems=remove_elems,
        )
        il.add_value("link", response.url)
        il.add_value("author_name", "VKI")
        date = response.css(".issue").re_first(
            r"veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})"
        )
        il.add_value("updated", date)
        # The print version URL is hidden in an onclick handler.
        url = response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
            r"window\.open\('(.*)'\);"
        )
        il.add_css("title", "h1::text")
        if url:
            return scrapy.Request(
                response.urljoin(url), callback=self._parse_article, meta={"il": il}
            )
        else:
            il.add_value("category", "paywalled")
            il.add_css("content_html", ".primary")
            il.add_css("content_html", 'div[style="padding-top:10px;"] > h3')
            return il.load_item()
Ejemplo n.º 58
0
 def parse(self, response):
     """Yield a feed entry for every article in the JSON listing."""
     articles = json.loads(response.text)
     remove_elems = [
         "hr + p",
         "hr",
         "iframe",
         # Trailing "follow us on Facebook/Twitter" paragraph.
         "p i:last-of-type:contains('Facebook'):contains('Twitter')",
     ]
     for article in articles:
         il = FeedEntryItemLoader(timezone="UTC", remove_elems=remove_elems)
         il.add_value("title", article["title"])
         il.add_value("link", article["url"])
         if "thumbnail_url_1_1" in article:
             il.add_value(
                 "content_html",
                 '<img src="{}">'.format(article["thumbnail_url_1_1"]),
             )
         il.add_value("content_html", article["body"])
         # publish_date is in milliseconds since the epoch.
         il.add_value(
             "updated", datetime.utcfromtimestamp(article["publish_date"] / 1000)
         )
         il.add_value(
             "author_name",
             [
                 contribution["contributor"]["full_name"]
                 for contribution in article["contributions"]
             ],
         )
         il.add_value("category", article["channel"]["name"])
         for topic in article["topics"] + [article["primary_topic"]]:
             if topic and "name" in topic:
                 il.add_value("category", topic["name"].title())
         if article["nsfw"]:
             il.add_value("category", "nsfw")
         if article["nsfb"]:
             il.add_value("category", "nsfb")
         il.add_value("path", response.meta["locale"])
         yield il.load_item()