Esempio n. 1
0
 def parse_item(self, response):
     """Build a feed entry from a single article page."""
     # Strip navigation, social and metadata chrome from the article body.
     removed = [
         'aside',
         'script',
         'h1',
         '.breadcrumbs',
         '.author-date',
         '.artikel-social-kommentar',
         '.bild-copyright',
         '.ressortTitleMobile',
         '.article-number',
         '.artikel-kommentarlink',
         '.umfrage-wrapper',
         '.articleIssueInfo',
     ]
     il = FeedEntryItemLoader(
         response=response,
         timezone=self._timezone,
         base_url='http://{}'.format(self.name),
         remove_elems=removed,
     )
     il.add_value('link', response.url)
     # Fall back to "Red." (editorial staff) when no author is given.
     byline = response.css('.author-date ::text').re(r'(?:Von)?\s*(\w+ \w+)')
     il.add_value('author_name', byline or 'Red.')
     il.add_css('title', 'h1[itemprop="headline"]::text')
     il.add_css('updated',
                'meta[property="article:published_time"]::attr(content)',
                re='([^+]*)')
     il.add_css('content_html', 'article')
     yield il.load_item()
Esempio n. 2
0
    def parse_album(self, response):
        """Parse an album product page into a feed entry."""

        def _replace_track_info(elem):
            # Collapse the element's children into "<p>text <i>(text)</i></p>".
            # Assumes the first two children carry the track name and the
            # extra info — TODO confirm against the page markup.
            parts = list(
                map(lambda x: x.text_content().strip(), elem.getchildren()))
            return '<p>{} <i>({})</i></p>'.format(parts[0], parts[1])

        # The last text node of the title block holds the album title.
        title = response.xpath('//h1[@class="c-product-block__title"]//text()'
                               ).extract()[-1].strip()
        # First contributor (everything up to the first comma) is the artist.
        artist = response.xpath(
            '//div[contains(@class,"c-product-block__contributors")]/p/text()'
        ).re_first('[^,]+')
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}/".format(self.name),
            remove_elems=[
                '.c-product-block__title', '.c-product__product-purchase',
                '.c-track__format-specific-info', '.c-track__duration',
                '.c-track__details', '.c-tracklist__initial-tracks',
                '.c-tabs-block__tabs-links', 'button'
            ],
            replace_elems={'.c-track__all-format-info': _replace_track_info})
        il.add_value("title", '{} - {}'.format(artist, title))
        il.add_value("link", response.url)
        il.add_value("author_name", 'bot')
        il.add_css("content_html", 'div.c-page--product')
        return il.load_item()
Esempio n. 3
0
 def _parse_article(self, response):
     """Parse one page of a (possibly multi-page) article.

     Follows pagination recursively, accumulating content into the same
     item loader; the finished item is returned from the last page.
     """
     remove_elems = [
         ".caption-credit",
         ".gallery-image-credit",
         "#social-left",
         "ul.toc",
         "h3:contains('Table of Contents')",
         "br",
         ".sidebar:contains('Further Reading')",
         ".credit",
     ]
     # Render sidebars/asides as quotes; convert div-based images via helper.
     change_tags = {".sidebar": "blockquote", "aside": "blockquote"}
     replace_elems = {"div.image": self._div_to_img}
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         replace_elems=replace_elems,
         change_tags=change_tags,
     )
     # Metadata is only extracted once, from the first page.
     if response.meta.get("first_page", False):
         il.add_value("link", response.url)
         il.add_css("author_name", ".byline a span ::text")
         il.add_css("content_html", "header h2")
         il.add_value("path", response.meta["path"])
     il.add_css("content_html", ".article-content")
     if response.css(".next"):
         # Presumably the last ".numbers" link points to the next page —
         # TODO confirm against the pagination markup.
         return scrapy.Request(
             response.css(".numbers a::attr(href)").extract()[-1],
             self._parse_article,
             meta={"il": il, "path": response.meta["path"]},
         )
     else:
         return il.load_item()
Esempio n. 4
0
    def parse(self, response):
        """Parse a JSON API listing page and schedule per-item requests."""
        json_response = json.loads(response.text)

        # NOTE(review): the presence check uses the 'next' key but the
        # request follows '_links.nextPage' — verify both keys exist in
        # the API response, otherwise this raises KeyError.
        if 'next' in json_response['_links']:
            yield Request(json_response['_links']['nextPage'],
                          meta={'dont_cache': True})

        for item in json_response['_embedded']['items']:
            il = FeedEntryItemLoader(response=response,
                                     timezone=self._timezone,
                                     dayfirst=False)
            il.add_value('title', item['title'])
            # Preview image goes at the top of the content.
            il.add_value(
                'content_html',
                '<img src="{}">'.format(item['playlist']['preview_image_url']))
            if item['description']:
                il.add_value('content_html',
                             item['description'].replace('\r\n', '<br>'))
            il.add_value('updated', item['date'])
            # Link to the public site instead of the API host.
            il.add_value(
                'link', item['url'].replace('api-tvthek.orf.at',
                                            'tvthek.orf.at'))
            yield Request(item['_links']['profile']['href'],
                          self._parse_profile,
                          meta={'item': il},
                          dont_filter=True)
Esempio n. 5
0
 def parse_release_changelog(self, response):
     """Append the detailed changelog section to the parent feed entry."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url=self._base_url,
     )
     loader.add_value("content_html", "<h1>Detailed Changelog</h1>")
     loader.add_xpath("content_html", "//h1/following-sibling::*")
     return loader.load_item()
Esempio n. 6
0
    def _parse_article(self, response):
        """Parse an article page into a feed entry."""

        def _use_data_original(img):
            # Lazy-loaded images keep the real URL in "data-original".
            if "data-original" in img.attrib:
                img.attrib["src"] = img.attrib["data-original"]
            return img

        strip_selectors = [
            ".credit",
            ".hide-caption",
            ".toggle-caption",
            ".enlarge-options",
            ".enlarge_measure",
            ".enlarge_html",
            ".ad-backstage",
            'p:first-of-type:contains("Editor\'s Note: This is an excerpt of")',
            'p:contains("Did you enjoy this newsletter segment?")',
        ]
        il = FeedEntryItemLoader(
            response=response,
            base_url=self._base_url,
            remove_elems=strip_selectors,
            replace_elems={"img": _use_data_original},
            change_tags={".image": "figure", ".credit-caption": "figcaption"},
        )
        il.add_css("title", "h1 ::text")
        il.add_value("link", response.url)
        il.add_css("content_html", "#storytext")
        il.add_value("path", response.meta["path"])
        il.add_css("updated", '.dateblock time::attr("datetime")')
        il.add_css("author_name", ".byline__name a::text")

        yield il.load_item()
Esempio n. 7
0
 def parse_release_changelog(self, response):
     """Extend the inherited entry with everything after the first <h1>."""
     parent_loader = response.meta["il"]
     il = FeedEntryItemLoader(response=response,
                              parent=parent_loader,
                              base_url=self._base_url)
     il.add_value("content_html", "<h1>Detailed Changelog</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     return il.load_item()
Esempio n. 8
0
    def parse_item(self, response):
        """Parse a news detail page into a feed entry."""
        il = FeedEntryItemLoader(
            response=response,
            base_url='{}/cms/'.format(self._link),
            timezone=self._timezone,
            remove_elems=['.news-latest-date', '.news-single-rightbox', 'hr',
                          'h7'],
            remove_elems_xpath=['//div[@class="news-single-item"]/b[1]',
                                '//div[@class="news-single-item"]/br[1]'],
        )

        # Page titles look like "... ::: <actual title>".
        il.add_value(
            'title',
            response.xpath('//head/title/text()').re_first(r'::: (.*)'))

        il.add_value('link', response.url)

        # Publication date (dd.mm.yyyy) sits in the right-hand box.
        il.add_value(
            'updated',
            response.xpath('//div[@class="news-single-rightbox"]').
            re_first(r'(\d{2}\.\d{2}\.\d{4})'))

        # Author fallbacks, in order: publisher meta tag, author meta tag,
        # finally the site name itself.
        il.add_value(
            'author_name',
            response.xpath('//head/meta[@name="publisher"]/@content').
            re_first('recht.at, (.*);'))
        il.add_xpath('author_name', '//head/meta[@name="author"]/@content')
        il.add_value('author_name', self.name)

        il.add_xpath('author_email', '//head/meta[@name="reply-to"]/@content')

        # Heading first, then the whole article body.
        il.add_css('content_html', '.news-single-item h7 font strong')
        il.add_css('content_html', '.news-single-item')

        yield il.load_item()
Esempio n. 9
0
    def _parse_article(self, response):
        """Parse a newspaper article page into a feed entry.

        Returns ``None`` for deleted articles (served with HTTP 410).
        """
        if response.status == 410:
            # Article has been deleted; nothing to parse.
            return

        # Drop author credits in image captions and embedded iframes.
        remove_elems = [".bildtext .author", "iframe"]
        # Demote the headline; turn image boxes into <figure>/<figcaption>.
        change_tags = {"h1": "h2", ".bildbox": "figure", ".bildtext": "figcaption"}
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            base_url="https://www.{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
            dayfirst=True,
            yearfirst=False,
        )
        # A ".payment" element marks paywalled content.
        if response.css(".payment"):
            il.add_value("category", "paywalled")
        il.add_css("link", 'link[rel="canonical"]::attr(href)')
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        # The author is printed between middle dots in the print headline.
        il.add_css("author_name", ".druckheadline::text", re=r"·\s*(.*)\s*·")
        # Mon, 01 Oct 18 13:42:45 +0200
        il.add_css("updated", 'meta[http-equiv="last-modified"]::attr(content)')
        il.add_css("content_html", ".druckcontent")
        il.add_value("path", response.meta["ressort"])
        return il.load_item()
Esempio n. 10
0
    def _parse_article(self, response):
        """Parse a newspaper article page into a feed entry.

        Deleted articles are served with HTTP 410 and skipped entirely.
        """
        if response.status == 410:
            # Article has been deleted; nothing to parse.
            return

        # Drop author credits in image captions and embedded iframes.
        remove_elems = [
            '.bildtext .author', 'iframe',
        ]
        # Demote the headline so it nests below the feed entry title.
        change_tags = {
            'h1': 'h2'
        }
        il = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 base_url='https://www.{}'.format(self.name),
                                 remove_elems=remove_elems,
                                 change_tags=change_tags,
                                 dayfirst=False,
                                 yearfirst=False)
        # A ".payment" element marks paywalled content.
        if response.css('.payment'):
            il.add_value('category', 'paywalled')
        il.add_css('link', 'link[rel="canonical"]::attr(href)')
        il.add_css('title', 'meta[property="og:title"]::attr(content)')
        # The author is printed between middle dots in the print headline.
        # Raw string required: "\s" in a non-raw literal is an invalid
        # escape sequence (SyntaxWarning on Python >= 3.12).
        il.add_css('author_name', '.druckheadline::text',
                   re=r'·\s*(.*)\s*·')
        il.add_css('updated',
                   'meta[http-equiv="last-modified"]::attr(content)')
        il.add_css('content_html', '.druckcontent')
        il.add_value('path', response.meta['ressort'])
        yield il.load_item()
Esempio n. 11
0
 def parse_release_changelog(self, response):
     """Yield the feed entry augmented with the detailed changelog."""
     entry = FeedEntryItemLoader(response=response,
                                 parent=response.meta['il'],
                                 base_url=self._base_url)
     entry.add_value('content_html', '<h1>Detailed Changelog</h1>')
     entry.add_xpath('content_html', '//h1/following-sibling::*')
     yield entry.load_item()
Esempio n. 12
0
 def parse_letter(self, response):
     """Parse a single letter page into a feed entry."""
     mailbox = response.meta["account"]
     il = FeedEntryItemLoader(
         response=response,
         base_url=self._links.get(mailbox),
     )
     il.add_value("path", mailbox)
     il.add_value("link", response.url)
     # Title, byline, date and body all live in the message markup.
     il.add_css("title", "title::text")
     il.add_css("author_name", "div#message-heading div.by-line a::text")
     il.add_css("updated", "div#message-heading div.date::text")
     il.add_css("content_html", "div.message-body")
     yield il.load_item()
Esempio n. 13
0
    def _parse_article(self, response):
        """Parse an article page; also caches breadcrumb section titles."""

        def _fix_img_src(elem):
            # Lazy-loaded images store the real URL in data attributes.
            if "src" not in elem.attrib:
                if "data-lazy-src" in elem.attrib:
                    elem.attrib["src"] = elem.attrib["data-lazy-src"]
                elif "data-src" in elem.attrib:
                    elem.attrib["src"] = elem.attrib["data-src"]
            return elem

        def _parse_breadcrumbs(breadcrumbs):
            # Pairs up alternating extract() results; assumes href/text
            # alternate so keys become paths without the leading "/" —
            # TODO confirm the selector's result ordering.
            links = breadcrumbs.css("a::text, a::attr('href')").extract()
            return {k[1:]: v for k, v in zip(links[::2], links[1::2])}

        breadcrumbs = _parse_breadcrumbs(
            response.css(".site-contextnavigation-breadcrumbs-nav a")
        )
        # Merge into shared spider state for later lookups.
        self._titles = {**self._titles, **breadcrumbs}

        remove_elems = [
            "ad-container",
            "figure > footer",
            "picture > button",
            "div[data-section-type='newsletter']",
            ".gallery-summary",
        ]
        change_tags = {
            ".article-subtitle": "strong",
            "aside": "blockquote",
            "p strong:only-child": "h3",
        }
        replace_elems = {"img": _fix_img_src}
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
            replace_elems=replace_elems,
            timezone="Europe/Vienna",
        )
        il.add_value("link", response.url)
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        if response.css(".article-origins .article-author-avatar"):
            # Blog posts.
            il.add_css("author_name", ".article-author-avatar > span ::text")
        else:
            # Normal articles.
            il.add_css("author_name", ".article-origins ::text")
        il.add_value("path", response.meta["ressort"])
        il.add_value("category", breadcrumbs.values())
        il.add_css("category", ".storylabels span ::text")
        il.add_css("updated", "time::attr('datetime')")
        il.add_css("content_html", ".article-subtitle")
        il.add_css("content_html", ".article-body")
        return il.load_item()
Esempio n. 14
0
 def parse_node(self, response, node):
     """Create a loader from an RSS node and fetch the full article."""
     il = FeedEntryItemLoader(selector=node)
     article_url = node.xpath("link/text()").extract_first()
     il.add_value("link", article_url)
     il.add_xpath("updated", "pubDate/text()")
     il.add_xpath(
         "title",
         "title/text()",
         # Use re.DOTALL since some titles have newlines in them.
         re=re.compile("(?:Artikel|Tagebuch): (.*)", re.DOTALL),
     )
     return scrapy.Request(article_url, self._parse_article, meta={"il": il})
Esempio n. 15
0
 def parse_node(self, response, node):
     """Parse one news sitemap node and request the linked article."""
     item_url = node.xpath("rss:loc/text()").extract_first()
     il = FeedEntryItemLoader(selector=node)
     il.add_value("link", item_url)
     il.add_xpath("title", "news:news/news:title/text()")
     il.add_xpath("updated", "news:news/news:publication_date/text()")
     # Keywords are a comma-separated list; map them to categories.
     keywords = node.xpath("news:news/news:keywords/text()").extract_first()
     if keywords:
         il.add_value("category", keywords.split(", "))
     return scrapy.Request(
         item_url, self.parse_item, meta={"il": il, "handle_httpstatus_list": [404]}
     )
Esempio n. 16
0
 def parse_item(self, response):
     """Extract a single event page into a feed entry."""
     loader = FeedEntryItemLoader(
         response=response,
         base_url="{}/".format(self.feed_link),
         timezone="Europe/Vienna",
         dayfirst=True,
         remove_elems=[".ruler", "h1"],
     )
     loader.add_css("title", "h1.event-title::text")
     loader.add_value("link", response.url)
     loader.add_css("content_html", "div#content.container")
     return loader.load_item()
Esempio n. 17
0
 def parse_item(self, response):
     """Turn an event detail page into a feed entry item."""
     cleanup = [".ruler", "h1"]
     il = FeedEntryItemLoader(response=response,
                              base_url="{}/".format(self.feed_link),
                              timezone="Europe/Vienna",
                              dayfirst=True,
                              remove_elems=cleanup)
     il.add_value("link", response.url)
     il.add_css("title", "h1.event-title::text")
     il.add_css("content_html", "div#content.container")
     return il.load_item()
Esempio n. 18
0
 def parse_item_text(self, response):
     """Add the article body (and paywall flag) to the parent entry."""
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=[".ad-component", ".wp-caption-text"],
         base_url="https://{}".format(self.name),
     )
     # A ".bluebox" element marks subscription-only content.
     if response.css(".bluebox"):
         il.add_value("category", "paywalled")
     il.add_css("content_html", "div.pR")
     return il.load_item()
Esempio n. 19
0
    def _parse_weekly_edition(self, response):
        """Parse a weekly edition page into one aggregate feed entry.

        Articles that have a dedicated page are requested individually and
        stripped out of the aggregate text.
        """
        remove_elems = ["h1"]
        # Map the site's CSS heading classes onto real heading levels.
        change_tags = {
            ".Cat1HL": "h1",
            ".Cat2HL": "h2",
            ".Cat3HL": "h3",
            ".SummaryHL": "h4",
        }
        il = FeedEntryItemLoader(
            response=response,
            parent=response.meta["il"],
            change_tags=change_tags,
            remove_elems=remove_elems,
            base_url=f"https://{self.name}",
        )

        # Request each standalone article separately.
        for url in response.css("h2.SummaryHL a::attr(href)").extract():
            yield scrapy.Request(
                response.urljoin(url),
                self._parse_article,
                meta={
                    "il": None,
                    "updated": response.meta["updated"]
                },
            )

        # Remove articles that have their own page.
        text = []
        in_article = False
        for line in response.css(".ArticleText").extract_first().splitlines(
                True):
            # Beginning of article.
            if '<h2 class="SummaryHL"><a href="/Articles/' in line:
                in_article = True
            if not in_article:
                text.append(line)
            # End of article. Note that the links to the comments doesn't
            # always include "#comments" so we can't check for that.
            if '">Comments (' in line:
                in_article = False
        text = "".join(text)

        # Remove page editor.
        text = re.sub(r"<b>Page editor</b>: .*", "", text)

        # Recursively remove headings with no content.
        text = _remove_empty_headings(text)

        il.add_css("title", "h1::text")
        il.add_value("content_html", text)
        il.add_value("link", response.url)
        yield il.load_item()
Esempio n. 20
0
 def parse_item_text(self, response):
     """Append the article body to the parent entry; flag paywalled items."""
     remove_elems = [
         '.dachzeile', 'h1', '.meta', 'br', 'form', '.button-container'
     ]
     il = FeedEntryItemLoader(response=response,
                              parent=response.meta['il'],
                              remove_elems=remove_elems,
                              base_url='http://{}'.format(self.name))
     # NOTE(review): extract_first() may return None if no <article> exists,
     # which would make the "in" check below raise TypeError — confirm the
     # pages always contain an <article> element.
     content = response.xpath('//article').extract_first()
     if 'Lesen Sie diesen Artikel in voller Länge' in content:
         il.add_value('category', 'paywalled')
     il.add_value('content_html', content)
     yield il.load_item()
Esempio n. 21
0
 def parse_item_text(self, response):
     """Append the full article text; tag truncated articles as paywalled."""
     strip = [".dachzeile", "h1", ".meta", "br", "form", ".button-container"]
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=strip,
         base_url="https://{}".format(self.name),
     )
     article_html = response.xpath("//article").extract_first()
     # The teaser of paywalled articles asks readers to buy the full text.
     if "Lesen Sie diesen Artikel in voller Länge" in article_html:
         il.add_value("category", "paywalled")
     il.add_value("content_html", article_html)
     return il.load_item()
Esempio n. 22
0
 def parse(self, response):
     """Parse the restaurant list embedded in the page's JavaScript.

     The 20 newest restaurants (by creation timestamp) become feed
     entries; details are filled in by ``_parse_restaurant``.
     """
     # Raw string with an escaped dot so only the literal
     # "window.DELINSKI" matches (unescaped "." matched any character).
     m = re.search(r"window\.DELINSKI, {listViewEntities: (.*)}", response.text)
     restaurants = sorted(
         json.loads(m.group(1))["restaurants"]["entities"].values(),
         key=lambda r: int(r["created"]),
         reverse=True,
     )
     for restaurant in restaurants[:20]:
         il = FeedEntryItemLoader(timezone="UTC", base_url=response.url)
         url = response.urljoin(restaurant["url"])
         il.add_value("link", url)
         il.add_value("title", restaurant["name"])
         content = """
         <img src="{image}">
         <ul>
             <li>{address}</li>
             <li>{price_range_human}</li>
             <li>{cuisine_text}</li>
         </ul>
         """
         il.add_value("content_html", content.format(**restaurant))
         # "created" is a Unix timestamp, interpreted as UTC.
         il.add_value(
             "updated", datetime.utcfromtimestamp(int(restaurant["created"]))
         )
         yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})
Esempio n. 23
0
 def parse_release_notes(self, response):
     """Parse release notes and chain a request for the changelog page."""
     il = FeedEntryItemLoader(
         response=response, timezone="Europe/Berlin", base_url=self._base_url
     )
     il.add_xpath("title", "//h1/text()")
     il.add_value("link", response.url)
     # "Last modified: <date> by <author>" — capture only the date part.
     il.add_xpath("updated", '//div[@class="docInfo"]', re="Last modified: (.*) by")
     il.add_value("content_html", "<h1>Release Notes</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     changelog_url = response.url.replace("notes-", "changelog-")
     return scrapy.Request(
         changelog_url, self.parse_release_changelog, meta={"il": il}
     )
Esempio n. 24
0
    def _parse_article(self, response):
        """Extract readable article content, falling back to feed data.

        Non-text responses (e.g. PDFs) and articles whose extracted content
        is implausibly short fall back to the feed entry's own summary.
        """
        feed_entry = response.meta["feed_entry"]

        il = FeedEntryItemLoader(parent=response.meta["il"])
        try:
            response.text
        except AttributeError:
            # Response is not text (e.g. PDF, ...).
            il.add_value("title", feed_entry.get("title"))
            il.add_value("content_html", feed_entry.get("summary"))
            return il.load_item()

        doc = Document(response.text, url=response.url)
        il.add_value("title", doc.short_title() or feed_entry.get("title"))
        summary = feed_entry.get("summary")
        try:
            content = doc.summary(html_partial=True)
            if summary and len(summary) > len(content):
                # Something probably went wrong if the extracted content is shorter than
                # the summary.
                raise Unparseable
        except Unparseable:
            content = summary
        il.add_value("content_html", content)

        return il.load_item()
Esempio n. 25
0
 def parse(self, response):
     """Parse the embedded restaurant list and emit the 20 newest entries."""
     # NOTE(review): the pattern is not a raw string and "." is unescaped,
     # so "window.DELINSKI" matches loosely; m is None (AttributeError on
     # m.group) if the JS blob changes — confirm this is acceptable.
     m = re.search("window.DELINSKI, {listViewEntities: (.*)}",
                   response.text)
     restaurants = sorted(
         json.loads(m.group(1))["restaurants"]["entities"].values(),
         key=lambda r: int(r["created"]),
         reverse=True,
     )
     for restaurant in restaurants[:20]:
         il = FeedEntryItemLoader(timezone="UTC", base_url=response.url)
         url = response.urljoin(restaurant["url"])
         il.add_value("link", url)
         il.add_value("title", restaurant["name"])
         content = """
         <img src="{image}">
         <ul>
             <li>{address}</li>
             <li>{price_range_human}</li>
             <li>{cuisine_text}</li>
         </ul>
         """
         il.add_value("content_html", content.format(**restaurant))
         # NOTE(review): datetime.utcfromtimestamp is deprecated since
         # Python 3.12; consider datetime.fromtimestamp(..., tz=UTC).
         il.add_value("updated",
                      datetime.utcfromtimestamp(int(restaurant["created"])))
         yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})
Esempio n. 26
0
    def _parse_video_page(self, response):
        """Parse a video page and chain a request for its stream manifest."""
        match = re.search(
            r"https?://(?:www\.)?servustv\.com/videos/(?P<id>[aA]{2}-\w+|\d+-\d+)",
            response.url,
        )
        if not match:
            # Not a video URL in the expected format.
            return
        video_id = match.group("id").upper()

        il = FeedEntryItemLoader(response=response)
        il.add_value("link", response.url)
        # Use the section as a title prefix unless it is the generic one.
        section = response.css(
            "meta[property='article:section']::attr('content')").extract_first(
            )
        if section != "Allgemein":
            il.add_value("title", section)
        il.add_css("title", "title::text", re="(.*) - Servus TV")
        # Content: preview image, description, then the asset details.
        image_url = response.css(
            "meta[property='og:image']::attr('content')").extract_first()
        il.add_value("content_html", '<img src="{}">'.format(image_url))
        il.add_css("content_html",
                   "meta[property='og:description']::attr('content')")
        il.add_css("content_html", "#media-asset-content-container")

        # "dateModified" appears in an inline script (presumably JSON-LD);
        # pull it out with a regex rather than parsing the script.
        match = re.search(r'"dateModified":\s*"([^"]+)"', response.text)
        if match:
            il.add_value("updated", match.group(1))

        stream_url = "https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8" % video_id

        yield Request(stream_url, self._parse_stream, meta={"il": il})
Esempio n. 27
0
 def parse_release_notes(self, response):
     """Build a feed entry from a release notes page."""
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Berlin",
         base_url=self.feed_link,
         remove_elems=[".cookielaw-banner"],
     )
     il.add_xpath("title", "//h1/text()")
     il.add_value("link", response.url)
     # "Last modified: <date> by <author>" — keep only the date part.
     il.add_xpath(
         "updated", '//div[@class="docInfo"]', re="Last modified: (.*) by"
     )
     il.add_value("content_html", "<h1>Release Notes</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     return il.load_item()
Esempio n. 28
0
 def parse_node(self, response, node):
     """Parse a news sitemap node and follow the article link."""
     link = node.xpath("rss:loc/text()").extract_first()
     il = FeedEntryItemLoader(selector=node)
     il.add_value("link", link)
     il.add_xpath("title", "news:news/news:title/text()")
     il.add_xpath("updated", "news:news/news:publication_date/text()")
     # Keywords are comma-separated; split them into categories.
     raw_keywords = node.xpath("news:news/news:keywords/text()").extract_first()
     if raw_keywords:
         il.add_value("category", raw_keywords.split(", "))
     request_meta = {"il": il, "handle_httpstatus_list": [404]}
     return scrapy.Request(link, self.parse_item, meta=request_meta)
Esempio n. 29
0
    def _parse_weekly_edition(self, response):
        """Parse a weekly edition into one aggregate feed entry.

        Articles with their own page are fetched separately and stripped
        from the aggregate text.
        """
        remove_elems = ['h1']
        # Map the site's CSS heading classes onto real heading levels.
        change_tags = {
            '.Cat1HL': 'h1',
            '.Cat2HL': 'h2',
            '.Cat3HL': 'h3',
            '.SummaryHL': 'h4',
        }
        il = FeedEntryItemLoader(response=response,
                                 parent=response.meta['il'],
                                 change_tags=change_tags,
                                 remove_elems=remove_elems,
                                 base_url='https://{}'.format(self.name))

        # Request each standalone article separately.
        for url in response.css('h2.SummaryHL a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(url),
                                 self._parse_article,
                                 meta={
                                     'il': None,
                                     'updated': response.meta['updated']
                                 })

        # Remove articles that have their own page.
        text = []
        in_article = False
        for line in (
                response.css('.ArticleText').extract_first().splitlines(True)):
            # Beginning of article.
            if '<h2 class="SummaryHL"><a href="/Articles/' in line:
                in_article = True
            if not in_article:
                text.append(line)
            # End of article. Note that the links to the comments doesn't
            # always include "#comments" so we can't check for that.
            if '">Comments (' in line:
                in_article = False
        text = ''.join(text)

        # Remove page editor.
        text = re.sub(r'<b>Page editor</b>: .*', '', text)

        # Recursively remove headings with no content.
        text = _remove_empty_headings(text)

        il.add_css('title', 'h1::text')
        il.add_value('content_html', text)
        il.add_value('link', response.url)
        yield il.load_item()
Esempio n. 30
0
 def _parse_article(self, response):
     """Parse an Addendum project article into a feed entry."""
     remove_elems = [
         '.projectNav',
         'h1',
         '.socialMedia__headline',
         '.whyRead',
         '.overlayCTA',
         '.authors',
         '.socialMedia',
         '.sidebar',
         '.sectionBackground--colorTheme1',
         '.heroStage__copyright',
         '.heroStage__downLink',
         'script',
         'iframe',
         '.image__zoom ',
         '.image__copyrightWrapper',
         '.callToAction',
         '.print-action',
         '.internalLink span',
     ]
     change_tags = {
         'div.heroStage__introText': 'strong',
         'figcaption': 'i',
         'figure': 'div'
     }
     # Replace responsive image placeholders with a plain linked <img>,
     # and swap inline videos for a note that they are article-only.
     replace_regex = {
         r'<span data-src="([^"]+)"></span>.*?' + r'<span data-src="([^"]+)" data-min-width="1000">':
         r'<a href="\2"><img src="\1"></a>',
         r'<div style=".*?"><video.*?></video>.*?</div></div>':
         '<em>Das eingebettete Video ist nur im Artikel verfügbar.</em>',
     }
     il = FeedEntryItemLoader(response=response,
                              timezone=self._timezone,
                              base_url='https://www.{}'.format(self.name),
                              remove_elems=remove_elems,
                              change_tags=change_tags,
                              replace_regex=replace_regex)
     il.add_value('link', response.url)
     il.add_value('author_name', 'Addendum')
     il.add_css('title', 'meta[property="og:title"]::attr(content)')
     il.add_css('updated',
                'meta[property="article:modified_time"]::attr(content)')
     # If not yet modified:
     il.add_css('updated',
                'meta[property="article:published_time"]::attr(content)')
     il.add_css('content_html', '.content')
     yield il.load_item()
Esempio n. 31
0
 def parse_release_notes(self, response):
     """Parse release notes, then fetch the matching changelog page."""
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Berlin",
         base_url=self._base_url,
     )
     il.add_xpath("title", "//h1/text()")
     il.add_value("link", response.url)
     # Extract the date out of "Last modified: <date> by <author>".
     il.add_xpath("updated", '//div[@class="docInfo"]', re="Last modified: (.*) by")
     il.add_value("content_html", "<h1>Release Notes</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     follow_up = response.url.replace("notes-", "changelog-")
     return scrapy.Request(follow_up, self.parse_release_changelog, meta={"il": il})
Esempio n. 32
0
 def _parse_article(self, response):
     """Extract readable article content via readability's Document.

     Falls back to the feed entry's own summary when extraction fails or
     yields implausibly short content.
     """
     feed_entry = response.meta["feed_entry"]
     il = FeedEntryItemLoader(parent=response.meta["il"])
     doc = Document(response.text, url=response.url)
     il.add_value("title", doc.short_title() or feed_entry.get("title"))
     summary = feed_entry.get("summary")
     try:
         content = doc.summary(html_partial=True)
         if summary and len(summary) > len(content):
             # Something probably went wrong if the extracted content is shorter than
             # the summary.
             raise Unparseable
     except Unparseable:
         content = summary
     il.add_value("content_html", content)
     return il.load_item()
Esempio n. 33
0
 def parse_release_notes(self, response):
     """Yield a request for the changelog once the notes are parsed."""
     il = FeedEntryItemLoader(response=response,
                              timezone=self._timezone,
                              base_url=self._base_url)
     il.add_xpath('title', '//h1/text()')
     il.add_value('link', response.url)
     # Only the date portion of the "Last modified" footer is wanted.
     il.add_xpath('updated', '//div[@class="docInfo"]',
                  re='Last modified: (.*) by')
     il.add_value('content_html', '<h1>Release Notes</h1>')
     il.add_xpath('content_html', '//h1/following-sibling::*')
     yield scrapy.Request(response.url.replace('notes-', 'changelog-'),
                          self.parse_release_changelog,
                          meta={'il': il})
Esempio n. 34
0
 def _parse_item(self, response):
     """Parse a HELP.gv.at topic page into a feed entry."""
     remove_elems = [
         "h1",
         ".nono",
         ".acceptance_org",
         ".state",
         "script",
         ".gentics-portletreload-position-notvisibleposition",
     ]
     # Drop outdated-content advice boxes and the inline table of contents.
     remove_elems_xpath = [
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     change_tags = {"abbr": "span"}
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Vienna",
         base_url="https://www.{}".format(self.name),
         remove_elems=remove_elems,
         remove_elems_xpath=remove_elems_xpath,
         change_tags=change_tags,
         dayfirst=True,
     )
     il.add_value("link", response.url)
     # Organisation name follows a <br> inside the acceptance block.
     il.add_xpath(
         "author_name",
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
     # Last-updated date (dd.mm.yyyy) is printed in the ".state" element.
     il.add_value(
         "updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")
     )
     il.add_css("content_html", ".Content")
     return il.load_item()
Esempio n. 35
0
 def _parse_article(self, response):
     """Merge the article body into the entry loader passed via meta."""
     # NOTE(review): the repeated br selectors look deliberate — presumably
     # each occurrence strips one leading/trailing <br>; confirm against
     # FeedEntryItemLoader's remove_elems semantics.
     cleanup = [
         ".noprint",
         "form",
         "font[size='3'] > b",
         "font[size='2'] > b:first-child",
         'a[href="mailto:[email protected]"]',
         "br:first-child",
         "br:first-child",
         "br:first-child",
         "br:first-child",
         "br:first-child",
         "br:first-child",
         "br:last-child",
         "br:last-child",
         "br:last-child",
         "br:last-child",
         "br:last-child",
         "br:last-child",
     ]
     # Strip date stamps and non-breaking spaces from the markup.
     regex_replacements = {
         r"\[\d{2}\.\d{2}\.\d{4}\]": "",
         # A0 is a non-breaking space in latin1.
         "\xA0": "",
         r"<br>\s*<br>\s*\d{1,2}\.\d{1,2}\.\d{4}\s*<br>": "",
     }
     loader = FeedEntryItemLoader(
         response=response,
         base_url=response.url,
         remove_elems=cleanup,
         replace_regex=regex_replacements,
         change_attribs={"font": {"size": None, "face": None, "color": None}},
         change_tags={"font": "div", "center": "div"},
         parent=response.meta["il"],
     )
     loader.add_css("author_name", ".sidebar .authors__name::text")
     if not response.css(".printwidth2"):
         # Tagebuch
         loader.add_css("content_html", ".lineall")
         loader.add_value("category", "Tagebuch")
     else:
         loader.add_css("content_html", ".printwidth2")
     return loader.load_item()
Esempio n. 36
0
    def _parse_article(self, response):
        """Parse an article page into a feed entry.

        Deleted articles (HTTP 410 Gone) are skipped entirely.
        """

        def _use_data_src(img):
            # Lazy-loaded images carry their real URL in data-src.
            data_src = img.attrib.get("data-src")
            if data_src is not None:
                img.attrib["src"] = data_src
            return img

        if response.status == 410:
            # Article has been deleted.
            return

        # Header chrome, copyright lines, share widgets and hidden columns.
        cleanup = [
            ".artDetail__header__container",
            ".artDetail__extImage__copyright",
            "#readspeaker_button1",
            ".artDetail__userOptions",
            ".container__col--hide",
            ".container__col--mdHide",
            ".artDetailMeineThemen__outer",
            ".artDetailAutor__outer",
            ".artDetailMehrZu",
            "div[style='display: none;']",
            ".artDetail__ooenplusOverlay",
        ]
        loader = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            base_url="https://www.{}".format(self.name),
            remove_elems=cleanup,
            replace_elems={"img": _use_data_src},
            dayfirst=True,
            yearfirst=False,
        )
        if response.css(".mainLogin__linkToggle"):
            # A login toggle on the page marks paywalled content.
            loader.add_value("category", "paywalled")
        loader.add_css("link", 'link[rel="canonical"]::attr(href)')
        loader.add_css("title", 'meta[property="og:title"]::attr(content)')
        loader.add_css("author_name", ".artDetailAutor__headline::text")
        # Date format: Mon, 01 Oct 18 13:42:45 +0200
        loader.add_css("updated", 'meta[name="date"]::attr(content)')
        loader.add_css("content_html", "article.artDetail")
        loader.add_css("category", ".artDetailOrt__linkText::text")
        loader.add_value("path", response.meta["ressort"])
        return loader.load_item()
Esempio n. 37
0
 def _parse_item(self, response):
     """Turn a HELP.gv.at page into a feed entry."""
     # Drop outdated-content disclaimers and the table of contents.
     drop_xpath = [
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Vienna",
         base_url="https://www.{}".format(self.name),
         # Headline, navigation and script clutter.
         remove_elems=[
             "h1",
             ".nono",
             ".acceptance_org",
             ".state",
             "script",
             ".gentics-portletreload-position-notvisibleposition",
         ],
         remove_elems_xpath=drop_xpath,
         change_tags={"abbr": "span"},
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "author_name",
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
     updated = response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")
     il.add_value("updated", updated)
     il.add_css("content_html", ".Content")
     return il.load_item()
Esempio n. 38
0
    def parse_item(self, response):
        """Parse an article into a feed entry.

        Skips articles that are gone (404), that carry no section meta tag,
        or whose section is not subscribed to.
        """
        if response.status == 404:
            self.logger.info("Article '{}' not available anymore.".format(
                response.url))
            return

        def _clean_caption(elem):
            # Captions are "text – credit"; keep the text, drop the credit.
            if "–" in elem.text:
                # Caption is of the format "text - credit".
                elem.text = re.sub(r"\s*([^–]*).*", r"\1", elem.text)
                return elem
            else:
                # It's just the "credit", remove it.
                return None

        section_path = response.css(
            'meta[name="kt:section-path"]::attr("content")'
        ).extract_first()
        if section_path is None:
            # Without the section meta tag we cannot categorize the article;
            # slicing None would raise a TypeError, so skip it instead.
            self.logger.warning("No section found for '{}'.".format(response.url))
            return
        section = section_path[1:]  # Skip the leading "/".
        if section not in self._sections and "all" not in self._sections:
            # Ignore the response as the ressort should not be parsed.
            return

        il = FeedEntryItemLoader(
            response=response,
            parent=response.meta["il"],
            remove_elems=[
                ".ad",
                ".article-paid",
                ".js-overlay-close",
                ".swiper-lazy-preloader",
            ],
            change_tags={".article__lead": "strong"},
            change_attribs={".zoomable__image--zoomed": {
                "data-src": "src"
            }},
            replace_elems={".article__media-caption": _clean_caption},
            base_url="https://www.{}".format(self.name),
        )
        il.add_css(
            "author_name",
            "article .article__author ::text",
            re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
        )
        il.add_css("content_html", "article .article__media .zoomable__inner")
        il.add_css("content_html",
                   "article .article__lead")  # change tags to strong
        il.add_css("content_html", "article .article__body")
        if response.css(".article-paid"):
            il.add_value("category", "paywalled")
        il.add_value("category", section.split("/"))
        if "all" in self._sections:
            il.add_value("path", "all")
        if section in self._sections:
            il.add_value("path", section)
        return il.load_item()
Esempio n. 39
0
 def parse_item(self, response):
     """Build a feed entry for a single book page."""
     il = FeedEntryItemLoader(
         selector=response.xpath('//div[@id="maincontentbook"]'),
         base_url=self.feed_link,
     )
     # Title: printed books use an h1, e-books an h3.
     il.add_xpath("title", '//h1[@class="p_book_title"]/text()')
     il.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()')
     il.add_value("link", response.url)
     il.add_value("author_name", self.feed_title)
     # Compose the body from the book's metadata fields, in display order.
     for xpath in (
         '//h1[@class="p_book_title"]/text()',
         '//h2[@class="p_book_author"]/text()',
         '//p[@class="p_book_publisher"]/text()',
         '//p[@class="p_book_isbn"]/text()',
         '(//span[@class="txt10px"])[1]/text()',
         '(//span[@class="txt10px"])[3]/text()',
         '//div[@class="bookcontent"]//text()',
         '//div[@class="p_book_image"]/img',
         '//span[@style="color:red;"]/b/text()',
     ):
         il.add_xpath("content_html", xpath)
     return il.load_item()
Esempio n. 40
0
 def parse_item(self, response):
     """Assemble a feed entry from the book detail page."""
     book = response.xpath('//div[@id="maincontentbook"]')
     loader = FeedEntryItemLoader(selector=book, base_url=self.feed_link)
     loader.add_value("link", response.url)
     loader.add_value("author_name", self.feed_title)
     # Printed books carry the title in an h1, e-books in an h3.
     loader.add_xpath("title", '//h1[@class="p_book_title"]/text()')
     loader.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()')
     # Body: title, author, publisher, ISBN, metadata spans, description,
     # cover image and availability note, in display order.
     loader.add_xpath("content_html", '//h1[@class="p_book_title"]/text()')
     loader.add_xpath("content_html", '//h2[@class="p_book_author"]/text()')
     loader.add_xpath("content_html", '//p[@class="p_book_publisher"]/text()')
     loader.add_xpath("content_html", '//p[@class="p_book_isbn"]/text()')
     loader.add_xpath("content_html", '(//span[@class="txt10px"])[1]/text()')
     loader.add_xpath("content_html", '(//span[@class="txt10px"])[3]/text()')
     loader.add_xpath("content_html", '//div[@class="bookcontent"]//text()')
     loader.add_xpath("content_html", '//div[@class="p_book_image"]/img')
     loader.add_xpath("content_html", '//span[@style="color:red;"]/b/text()')
     return loader.load_item()
Esempio n. 41
0
    def parse_item(self, response):
        """Parse an article into a feed entry.

        Skips articles that are gone (404), that carry no section meta tag,
        or whose section is not subscribed to.
        """
        if response.status == 404:
            self.logger.info("Article '{}' not available anymore.".format(response.url))
            return

        def _clean_caption(elem):
            # Captions are "text – credit"; keep the text, drop the credit.
            if "–" in elem.text:
                # Caption is of the format "text - credit".
                elem.text = re.sub(r"\s*([^–]*).*", r"\1", elem.text)
                return elem
            else:
                # It's just the "credit", remove it.
                return None

        section_path = response.css(
            'meta[name="kt:section-path"]::attr("content")'
        ).extract_first()
        if section_path is None:
            # Without the section meta tag we cannot categorize the article;
            # slicing None would raise a TypeError, so skip it instead.
            self.logger.warning("No section found for '{}'.".format(response.url))
            return
        section = section_path[1:]  # Skip the leading "/".
        if section not in self._sections and "all" not in self._sections:
            # Ignore the response as the ressort should not be parsed.
            return

        il = FeedEntryItemLoader(
            response=response,
            parent=response.meta["il"],
            remove_elems=[
                ".ad",
                ".article-paid",
                ".js-overlay-close",
                ".swiper-lazy-preloader",
            ],
            change_tags={".article__lead": "strong"},
            change_attribs={".zoomable__image--zoomed": {"data-src": "src"}},
            replace_elems={".article__media-caption": _clean_caption},
            base_url="https://www.{}".format(self.name),
        )
        il.add_css(
            "author_name",
            "article .article__author ::text",
            re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
        )
        il.add_css("content_html", "article .article__media .zoomable__inner")
        il.add_css("content_html", "article .article__lead")  # change tags to strong
        il.add_css("content_html", "article .article__body")
        if response.css(".article-paid"):
            il.add_value("category", "paywalled")
        il.add_value("category", section.split("/"))
        if "all" in self._sections:
            il.add_value("path", "all")
        if section in self._sections:
            il.add_value("path", section)
        return il.load_item()
Esempio n. 42
0
    def parse_item(self, response):
        """Parse a single news entry into a feed item."""
        loader = FeedEntryItemLoader(
            response=response,
            base_url="{}/cms/".format(self.feed_link),
            timezone="Europe/Vienna",
            remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"],
            remove_elems_xpath=[
                '//div[@class="news-single-item"]/b[1]',
                '//div[@class="news-single-item"]/br[1]',
            ],
            dayfirst=True,
        )
        title = response.xpath("//head/title/text()").re_first(r"::: (.*)")
        loader.add_value("title", title)
        loader.add_value("link", response.url)
        date = response.xpath('//div[@class="news-single-rightbox"]').re_first(
            r"(\d{2}\.\d{2}\.\d{4})"
        )
        loader.add_value("updated", date)
        # Author: prefer the publisher meta tag, then the author meta tag,
        # finally fall back to the site name.
        publisher = response.xpath('//head/meta[@name="publisher"]/@content').re_first(
            "recht.at, (.*);"
        )
        loader.add_value("author_name", publisher)
        loader.add_xpath("author_name", '//head/meta[@name="author"]/@content')
        loader.add_value("author_name", self.name)
        loader.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')
        loader.add_css("content_html", ".news-single-item h7 font strong")
        loader.add_css("content_html", ".news-single-item")
        return loader.load_item()
Esempio n. 43
0
 def parse_node(self, response, node):
     """Collect entry metadata from the feed node, then fetch the article."""
     loader = FeedEntryItemLoader(
         response=response, base_url="https://{}".format(self.name), dayfirst=True
     )
     loader.add_value("updated", node.xpath("//pubDate/text()").extract_first())
     loader.add_value("author_name", node.xpath("//dc:creator/text()").extract_first())
     loader.add_value("category", node.xpath("//category/text()").extract())
     # Fallback to the first category if no title is provided (e.g. comic).
     title = (
         node.xpath("(//title)[2]/text()").extract()
         or node.xpath("//category/text()").extract_first()
     )
     loader.add_value("title", title)
     link = node.xpath("(//link)[2]/text()").extract_first()
     loader.add_value("link", link)
     # Authenticate the article request when a Steady token is configured.
     cookies = {"steady-token": self._steady_token} if self._steady_token else None
     return scrapy.Request(
         link, self._parse_article, cookies=cookies, meta={"il": loader}
     )
Esempio n. 44
0
 def _parse_interview(self, response):
     """Parse an interview page into a feed entry."""
     # NOTE(review): the two h2 selectors are listed twice on purpose —
     # presumably each removal pass strips one trailing section, so two
     # passes drop the last two h2s plus the paragraphs below them;
     # confirm against the loader's remove_elems semantics.
     remove_elems = [
         ".shareable-quote",
         ".share-bar",
         # Remove the last two h2s and all paragraphs below.
         ".interview-body > h2:last-of-type ~ p",
         ".interview-body > h2:last-of-type",
         ".interview-body > h2:last-of-type ~ p",
         ".interview-body > h2:last-of-type",
     ]
     loader = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         remove_elems=remove_elems,
     )
     loader.add_value("link", response.url)
     loader.add_value("updated", response.meta["updated"])
     loader.add_css("title", "h1::text")
     loader.add_css("author_name", "header .user-link__name::text")
     loader.add_css("content_html", ".interview-body")
     return loader.load_item()
Esempio n. 45
0
 def _parse_episode(self, response):
     """Build a feed entry for a single TV episode page."""
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "title",
         '//meta[@name="title"]/@content',
         re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
     )
     # The air date is embedded in the title meta tag; fall back to
     # midnight when the listing supplied no broadcast time.
     air_date = response.xpath('//meta[@name="title"]/@content').re_first(
         r".*vom (\d{2}\.\d{2}\.\d{4}).*"
     )
     air_time = response.meta["time"] or "00:00"
     il.add_value("updated", "{} {}".format(air_date, air_time))
     # Use the preview image as the visual part of the entry body.
     preview = response.xpath(
         '//meta[@property="og:image"]/@content'
     ).extract_first()
     il.add_value("content_html", '<img src="{}">'.format(preview))
     il.add_css("content_html", ".player-video-description-intro::text")
     return il.load_item()
Esempio n. 46
0
 def parse_item(self, response):
     """Parse a news article into a feed entry."""
     # Strip navigation, social widgets and other non-article chrome.
     cleanup = [
         "aside",
         "script",
         "h1",
         "source",
         ".breadcrumbs",
         ".author-date",
         ".artikel-social-kommentar",
         ".bild-copyright",
         ".ressortTitleMobile",
         ".article-number",
         ".artikel-kommentarlink",
         ".umfrage-wrapper",
         ".articleIssueInfo",
     ]
     loader = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         remove_elems=cleanup,
     )
     loader.add_value("link", response.url)
     # Fall back to "Red." (editorial team) when no author byline matches.
     byline = response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)")
     loader.add_value("author_name", byline if byline else "Red.")
     loader.add_css("title", 'h1[itemprop="headline"]::text')
     loader.add_value("updated", response.meta["updated"])
     loader.add_css("content_html", "article")
     return loader.load_item()
Esempio n. 47
0
    def parse_item(self, response):
        """Build a feed entry for an Oatmeal comic or blog post."""
        loader = FeedEntryItemLoader(response=response, base_url=self._base_url)
        loader.add_value("updated", response.meta["updated"])
        loader.add_value("author_name", response.meta["author_name"])
        loader.add_value("link", response.url)
        loader.add_css("title", "title::text", re="(.*) - The Oatmeal")
        # The first URL path segment doubles as the category.
        first_segment = urlsplit(response.url).path.strip("/").split("/")[0]
        loader.add_value("category", first_segment)

        # Comic pages keep their images under #comic.
        loader.add_css("content_html", "#comic > img")
        loader.add_css("content_html", "#comic > p > img")

        # Blog posts embed their images in the centered text container.
        loader.add_css("content_html", "#blog .center_text img")
        return loader.load_item()
Esempio n. 48
0
 def parse_archive_search(self, response):
     """Create a feed entry request for every hit of the archive search."""
     for i, item in enumerate(json.loads(response.text)["result"]["hits"]):
         il = FeedEntryItemLoader(
             response=response,
             base_url="https://{}".format(self.name),
             timezone="Europe/Vienna",
         )
         il.add_value("path", "magazine")
         link = response.urljoin(item["detail_link"])
         il.add_value("link", link)
         try:
             author = re.sub(
                 r"(?:.*:|Von)\s*(.*)", r"\1", ", ".join(item["authors"]).title()
             )
             il.add_value("author_name", author)
         except (IndexError, KeyError):
             # Hits without an "authors" key raise KeyError; the previous
             # "except IndexError" alone could never trigger since nothing
             # in the try block raises it. Entries without authors are
             # simply emitted without an author name.
             pass
         il.add_value("title", item["title"])
         # All articles have the same date.
         # We add an offset so they are sorted in the right order.
         date = response.meta["issue_date"] + timedelta(seconds=i)
         il.add_value("updated", date)
         yield scrapy.Request(link, self.parse_item_text, meta={"il": il})
Esempio n. 49
0
 def _parse_user_profile(self, response):
     """Remember the user's display name and emit one entry per posting."""
     display_name = response.css("#up_user h2::text").extract_first().strip()
     self._users[response.meta["user_id"]] = display_name
     for posting in response.css(".posting"):
         loader = FeedEntryItemLoader(
             selector=posting,
             base_url="https://{}".format(self.name),
             change_tags={"span": "p"},
         )
         loader.add_css("title", ".text strong::text")
         loader.add_css("link", '.text a::attr("href")')
         # The data-timestamp attribute is in milliseconds since the epoch.
         timestamp_ms = int(
             posting.css('.date::attr("data-timestamp")').extract_first()
         )
         loader.add_value("updated", datetime.utcfromtimestamp(timestamp_ms / 1000))
         loader.add_css("content_html", ".text span")
         loader.add_css("content_html", ".article h4")
         loader.add_value("path", response.meta["path"])
         yield loader.load_item()
Esempio n. 50
0
    def parse_content(self, response):
        """Create a feed entry for a library item from its detail rows."""
        parts = self._extract_parts(response)
        loader = FeedEntryItemLoader(
            response=response, timezone="Europe/Vienna", dayfirst=True
        )
        loader.add_value("path", self._library)
        # Everything before the first metadata row makes up the title.
        loader.add_value("title", " - ".join(parts[: self._find_first_meta(parts)]))
        loader.add_value("link", response.url)
        loader.add_xpath("updated", "//td/span/text()", re="In der Bibliothek seit: (.*)")

        # Render all parts as a bullet list.
        items = "".join("<li>{}</li>".format(part) for part in parts)
        loader.add_value("content_html", "<ul>" + items + "</ul>")
        return loader.load_item()
Esempio n. 51
0
 def parse_item(self, response):
     """Turn a notification page into a feed entry."""
     main = response.xpath('//div[@class="main"]')
     loader = FeedEntryItemLoader(selector=main, timezone="Europe/Vienna")
     loader.add_value("link", response.url)
     loader.add_xpath("title", "h1/text()")
     loader.add_xpath("content_html", "h1/following-sibling::*")
     # The last URL path segment starts with the publication date.
     date_part = response.url.rstrip("/").split("/")[-1].split("_")[0]
     loader.add_value("updated", date_part)
     loader.add_value("author_name", self.name)
     return loader.load_item()
Esempio n. 52
0
 def parse_node(self, response, node):
     """Collect feed metadata from the RSS node, then fetch the article."""
     loader = FeedEntryItemLoader()
     loader.add_value("title", node.xpath("title/text()").extract_first())
     loader.add_value("updated", node.xpath("pubDate/text()").extract_first())
     loader.add_value("category", node.xpath("category/text()").extract())
     article_url = node.xpath("link/text()").extract_first()
     # Request the mobile version; the meta dict threads the loader and
     # pagination state through to the article callback.
     return scrapy.Request(
         article_url,
         self._parse_article,
         cookies={"view": "mobile"},
         meta={"il": loader, "path": response.meta["path"], "first_page": True},
     )
Esempio n. 53
0
    def parse(self, response):
        """Scrape the latest Mitteilungsblatt (TISS bulletin) into entries.

        NOTE(review): ``response = yield scrapy.Request(...)`` implies an
        inline-requests style generator (the yielded request's response is
        sent back into this coroutine) — confirm the decorator/middleware
        enabling this on the spider.
        """
        mitteilungsblaetter = response.css(".mitteilungsblaetter")
        updated = mitteilungsblaetter.css("::text").re_first(r"(\d{2}\.\d{2}\.\d{4})")
        link = response.urljoin(
            mitteilungsblaetter.css('a::attr("href")').extract_first()
        )

        # HEAD request only to resolve redirects and learn the final URL.
        response = yield scrapy.Request(link, method="HEAD")
        mb_url = response.url
        match = re.search(
            r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)", mb_url
        )
        if not match:
            self.logger.error("No Mitteilungsblätter found!")
            return
        else:
            mb_id = match.group(1)

        url = "https://tiss.{}/api/mbl/v22/id/{}".format(self.name, mb_id)
        response = yield scrapy.Request(url)

        # Entries come in main/sub numbering ("3", "3.1", ...). Iterating in
        # reverse, each sub-entry is folded into the content of the next
        # main entry with the same number before that entry is emitted.
        last_entry = None
        for entry in reversed(json.loads(response.text)["knoten"]):
            (entry["main"], entry["sub"]) = re.match(
                r"(\d+)\.?(\d*)", entry["counter"]
            ).groups()
            if last_entry is not None and last_entry["main"] == entry["main"]:
                entry["inhalt"] += "<h2>{}</h2>".format(last_entry["titel"])
                entry["inhalt"] += last_entry["inhalt"]
            if entry["sub"] == "":
                il = FeedEntryItemLoader(
                    base_url="https://tiss.{}".format(self.name),
                    timezone="Europe/Vienna",
                    dayfirst=True,
                )
                il.add_value("updated", updated)
                il.add_value("link", mb_url + "#{}".format(entry["counter"]))
                il.add_value("title", entry["titel"])
                il.add_value("content_html", entry["inhalt"])
                yield il.load_item()
                last_entry = None
            else:
                last_entry = entry
Esempio n. 54
0
    def parse(self, response):
        """Parse a listing page: one entry per item, plus the feed header.

        Follows pagination up to ``self._scrape_pages`` pages; the header is
        emitted only on the first page.
        """
        if len(response.css(".thumbnail")) == 0:
            self.logger.info("No items found.")
            return

        for item in response.css(".thumbnail"):
            il = FeedEntryItemLoader(selector=item, base_url=self._base_url)
            il.add_css("title", ".item_brand_text ::text")
            il.add_css("title", ".item-title ::text")
            il.add_css("title", ".current-price ::text")
            il.add_value(
                "link",
                response.urljoin(item.css(".item-link::attr(href)").extract_first()),
            )
            image_url = item.css(".item-image::attr(data-bg)").re_first(
                r"url\(([^)]+)\)"
            )
            # re_first() returns None when no background image is set; the
            # previous code crashed with AttributeError on None.startswith().
            if image_url:
                # Fix broken images.
                if image_url.startswith("https://markenankauf.momox.de/pics/https://"):
                    image_url = image_url.replace(
                        "https://markenankauf.momox.de/pics/https://", "https://"
                    )
                il.add_value("content_html", '<img src="{}">'.format(image_url))
            il.add_css("content_html", ".item-des-container")
            il.add_value("path", response.meta["path"])
            yield il.load_item()

        page = int(response.css(".pagination .active a::text").extract_first())
        if page == 1:
            # Emit the feed header exactly once.
            yield generate_feed_header(
                title=response.css("title ::text").re_first(
                    "(ubup | .*) Second Hand kaufen"
                ),
                subtitle="Deutschlands größter Second Hand-Onlineshop für "
                "Mode & Accessoires",
                icon="https://www.{}/images/favicon.ico".format(self.name),
                link=response.url,
                path=response.meta["path"],
            )
        if page < self._scrape_pages:
            next_page = response.css(
                ".pagination .active + li a::attr(href)"
            ).extract_first()
            if next_page:
                yield scrapy.Request(
                    response.urljoin(next_page),
                    meta={"dont_cache": True, "path": response.meta["path"]},
                )
Esempio n. 55
0
 def parse(self, response):
     """Parse the news overview page; dispatch one request per news item."""
     # Wiener Linien returns HTML with an XML content type which creates an
     # XmlResponse.
     html_response = HtmlResponse(url=response.url, body=response.body)
     for news in html_response.css(".block-news-item"):
         loader = FeedEntryItemLoader(
             response=html_response,
             timezone="Europe/Vienna",
             ignoretz=True,
             base_url="https://www.{}".format(self.name),
         )
         url = html_response.urljoin(news.css("a::attr(href)").extract_first())
         loader.add_value("link", url)
         loader.add_value("title", news.css("h3::text").extract_first())
         loader.add_value("updated", news.css(".date::text").extract_first())
         yield scrapy.Request(url, self.parse_item, meta={"il": loader})
Esempio n. 56
0
 def parse_program(self, response):
     """Extract a video entry; drop videos outside the configured range."""
     player = response.css(r".jsb_video\/FlashPlayer")
     if not player:
         return
     # The player configuration is embedded as JSON in a data attribute.
     config = json.loads(player.xpath("@data-jsb").extract()[0])
     tracking = config["config"]["initial_video"]["parts"][0]["tracking"]["nurago"]
     loader = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     loader.add_value("link", tracking["clipurl"])
     loader.add_value("title", tracking["programname"])
     loader.add_value("updated", tracking["airdate"])
     loader.add_xpath("content_html", '//p[@class="plot_summary"]')
     item = loader.load_item()
     # Only include videos posted in the last 7 days.
     if item["updated"] + self._timerange > datetime.now(timezone.utc):
         return item
Esempio n. 57
0
    def _parse_article_url(self, response):
        """Locate the article's print version and request it for parsing.

        Falls back to the paywalled on-page content when no print version
        link exists. Raises DropResponse for empty or error pages.
        """
        if not response.css("#content"):
            raise DropResponse(
                "Skipping {} since it is empty".format(response.url), transient=True
            )

        # Pages without any h2 made `"Fehler" in None` raise a TypeError;
        # default to an empty string so such pages are parsed normally.
        heading = response.css("h2 ::text").extract_first() or ""
        if "Fehler" in heading:
            raise DropResponse(
                "Skipping {} since it returned an error".format(response.url),
                transient=True,
            )

        remove_elems = ['div[style="padding-top:10px;"]']
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            base_url="https://{}".format(self.name),
            dayfirst=True,
            remove_elems=remove_elems,
        )
        il.add_value("link", response.url)
        il.add_value("author_name", "VKI")
        date = response.css(".issue").re_first(
            r"veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})"
        )
        il.add_value("updated", date)
        # The print version URL is hidden inside an onclick handler.
        url = response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
            r"window\.open\('(.*)'\);"
        )
        il.add_css("title", "h1::text")
        if url:
            return scrapy.Request(
                response.urljoin(url), callback=self._parse_article, meta={"il": il}
            )
        else:
            # No print version: content is paywalled, use the teaser.
            il.add_value("category", "paywalled")
            il.add_css("content_html", ".primary")
            il.add_css("content_html", 'div[style="padding-top:10px;"] > h3')
            return il.load_item()
Esempio n. 58
0
 def parse(self, response):
     """Convert the JSON article listing into feed entries."""
     # Drop separators, embeds and the trailing social-media plug.
     cleanup = [
         "hr + p",
         "hr",
         "iframe",
         "p i:last-of-type:contains('Facebook'):contains('Twitter')",
     ]
     for article in json.loads(response.text):
         loader = FeedEntryItemLoader(timezone="UTC", remove_elems=cleanup)
         loader.add_value("title", article["title"])
         loader.add_value("link", article["url"])
         if "thumbnail_url_1_1" in article:
             loader.add_value(
                 "content_html",
                 '<img src="{}">'.format(article["thumbnail_url_1_1"]),
             )
         loader.add_value("content_html", article["body"])
         # publish_date is in milliseconds since the epoch.
         published = datetime.utcfromtimestamp(article["publish_date"] / 1000)
         loader.add_value("updated", published)
         authors = [
             contribution["contributor"]["full_name"]
             for contribution in article["contributions"]
         ]
         loader.add_value("author_name", authors)
         loader.add_value("category", article["channel"]["name"])
         for topic in article["topics"] + [article["primary_topic"]]:
             if topic and "name" in topic:
                 loader.add_value("category", topic["name"].title())
         # Flag adult content so users can filter on the category.
         if article["nsfw"]:
             loader.add_value("category", "nsfw")
         if article["nsfb"]:
             loader.add_value("category", "nsfb")
         loader.add_value("path", response.meta["locale"])
         yield loader.load_item()
Esempio n. 59
0
    def _parse_article(self, response):
        """Parse a derstandard.at article page into a feed entry.

        Depending on the page type this either requests the blog delivery
        endpoint (user blogs), renders a feature (cover) page, or extracts
        the regular article body.
        """

        def _fix_img_src(elem):
            src = elem.attrib.pop("data-zoom-src", None)
            # data-zoom-src is only valid if it starts with //images.derstandard.at.
            if src and src.startswith("//images.derstandard.at"):
                elem.attrib["src"] = src
            elem.attrib.pop("width", None)
            elem.attrib.pop("height", None)
            elem.attrib.pop("class", None)
            return elem

        remove_elems = [
            ".credits",
            ".owner-info",
            ".image-zoom",
            ".continue",
            ".sequence-number",
            ".js-embed-output",
            "#mycountrytalks-embed",
            # Remove self-promotion for (other) ressorts.
            '.js-embed-output-feeds a[href^="/r"]',
            '.js-embed-output-feeds a[href^="https://derstandard.at/"]',
            (
                ".js-embed-output-feeds "
                + 'img[src="https://images.derstandard.at/2018/10/18/'
                + 'Immobiliensuche202x122.png"]'
            ),
        ]
        # Turn the media list and photo markup into semantic figure/figcaption.
        change_tags = {
            "#media-list li .description": "figcaption",
            "#media-list li": "figure",
            "#media-list": "div",
            ".photo": "figure",
            ".caption": "figcaption",
        }
        replace_elems = {
            ".embedded-posting": "<p><em>Hinweis: Das eingebettete Posting ist nur "
            + "im Artikel verfügbar.</em></p>",
            # Replace every special script container with its unescaped content.
            "script.js-embed-template": lambda elem: (
                '<div class="js-embed-output-feeds">'
                + html.unescape(elem.text or "")
                + "</div>"
            ),
            "img": _fix_img_src,
        }
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
            replace_elems=replace_elems,
        )
        il.add_value("link", response.url)
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        for author in response.css("span.author::text").extract():
            # Sometimes the author name is messed up and written in upper case.
            # This happens usually for articles written by Günter Traxler.
            if author.upper() == author:
                author = author.title()
            il.add_value("author_name", author)
        il.add_value("path", response.meta["ressort"])
        il.add_value("updated", response.meta["updated"])
        il.add_css("category", "#breadcrumb .item a::text")
        blog_id = response.css("#userblogentry::attr(data-objectid)").extract_first()
        if blog_id:
            # User blog: the content is served by a separate delivery endpoint.
            url = (
                "https://{}/userprofil/bloggingdelivery/blogeintrag?godotid={}"
            ).format(self.name, blog_id)
            return scrapy.Request(url, self._parse_blog_article, meta={"il": il})
        elif response.css("#feature-content"):
            # Feature (longread) page: the cover photo URL is embedded in the
            # inline style attribute.
            cover_photo = response.css("#feature-cover-photo::attr(style)").re_first(
                r"\((.*)\)"
            )
            il.add_value("content_html", '<img src="{}">'.format(cover_photo))
            il.add_css("content_html", "#feature-cover-title h2")
            il.add_css("content_html", "#feature-content > .copytext")
            return il.load_item()
        else:
            # Regular article markup.
            il.add_css("content_html", "#content-aside")
            il.add_css("content_html", "#objectContent > .copytext")
            il.add_css("content_html", "#content-main > .copytext")
            il.add_css("content_html", ".slide")
            return il.load_item()