Beispiel #1
0
 def _parse_episode(self, response):
     """Build a feed entry for a single puls4.com episode page."""
     loader = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     loader.add_value("link", response.url)
     # Title is embedded as "<show> vom <date> - puls4.com".
     loader.add_xpath(
         "title",
         '//meta[@name="title"]/@content',
         re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
     )
     # Date comes from the <meta name="title"> tag; time from request meta,
     # defaulting to midnight when absent.
     episode_date = response.xpath('//meta[@name="title"]/@content').re_first(
         r".*vom (\d{2}\.\d{2}\.\d{4}).*"
     )
     episode_time = response.meta["time"] or "00:00"
     loader.add_value("updated", "{} {}".format(episode_date, episode_time))
     # Use the og:image preview as the entry's visual content.
     image_url = response.xpath('//meta[@property="og:image"]/@content').extract_first()
     loader.add_value("content_html", '<img src="{}">'.format(image_url))
     loader.add_css("content_html", ".player-video-description-intro::text")
     return loader.load_item()
Beispiel #2
0
    def _parse_article(self, response):
        """Parse an article page into a feed entry.

        Emits nothing for articles that have been deleted (HTTP 410).
        """
        if response.status == 410:
            # Article has been deleted.
            return

        remove_elems = [
            '.bildtext .author', 'iframe',
        ]
        change_tags = {
            'h1': 'h2'
        }
        il = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 base_url='https://www.{}'.format(self.name),
                                 remove_elems=remove_elems,
                                 change_tags=change_tags,
                                 dayfirst=False,
                                 yearfirst=False)
        if response.css('.payment'):
            il.add_value('category', 'paywalled')
        il.add_css('link', 'link[rel="canonical"]::attr(href)')
        il.add_css('title', 'meta[property="og:title"]::attr(content)')
        # Author is enclosed in "·" separators. Raw string: a non-raw "\s"
        # is an invalid escape sequence (SyntaxWarning on modern Python).
        il.add_css('author_name', '.druckheadline::text',
                   re=r'·\s*(.*)\s*·')
        il.add_css('updated',
                   'meta[http-equiv="last-modified"]::attr(content)')
        il.add_css('content_html', '.druckcontent')
        il.add_value('path', response.meta['ressort'])
        yield il.load_item()
Beispiel #3
0
 def parse_item(self, response):
     """Extract a feed entry from an article page."""
     # Markup stripped from the article before rendering.
     unwanted = [
         "aside", "script", "h1", "source",
         ".breadcrumbs", ".author-date", ".artikel-social-kommentar",
         ".bild-copyright", ".ressortTitleMobile", ".article-number",
         ".artikel-kommentarlink", ".umfrage-wrapper", ".articleIssueInfo",
     ]
     loader = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         remove_elems=unwanted,
     )
     loader.add_value("link", response.url)
     # Byline like "(Von) Firstname Lastname"; fall back to "Red."
     # (editorial staff) when no author can be extracted.
     byline = response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)")
     loader.add_value("author_name", byline or "Red.")
     loader.add_css("title", 'h1[itemprop="headline"]::text')
     loader.add_value("updated", response.meta["updated"])
     loader.add_css("content_html", "article")
     return loader.load_item()
Beispiel #4
0
 def _parse_episode(self, response):
     """Create a feed entry for one episode page."""
     loader = FeedEntryItemLoader(
         response=response,
         base_url=f"https://{self.name}",
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     loader.add_value("link", response.url)
     # The <meta name="title"> content looks like "<show> vom <date> - puls4.com".
     loader.add_xpath(
         "title",
         '//meta[@name="title"]/@content',
         re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
     )
     broadcast_date = response.xpath('//meta[@name="title"]/@content').re_first(
         r".*vom (\d{2}\.\d{2}\.\d{4}).*"
     )
     # Time is supplied via request meta; default to midnight.
     broadcast_time = response.meta["time"] or "00:00"
     loader.add_value("updated", f"{broadcast_date} {broadcast_time}")
     preview = response.xpath('//meta[@property="og:image"]/@content').extract_first()
     loader.add_value("content_html", f'<img src="{preview}">')
     loader.add_css("content_html", ".player-video-description-intro::text")
     return loader.load_item()
Beispiel #5
0
    def parse_album(self, response):
        """Scrape an album product page into a feed entry."""

        def _replace_track_info(elem):
            # Render the element's two children as "<track> (<format info>)".
            parts = [child.text_content().strip() for child in elem.getchildren()]
            return '<p>{} <i>({})</i></p>'.format(parts[0], parts[1])

        # Last text node of the title heading holds the album name.
        title = response.xpath(
            '//h1[@class="c-product-block__title"]//text()'
        ).extract()[-1].strip()
        # First comma-separated contributor is treated as the artist.
        artist = response.xpath(
            '//div[contains(@class,"c-product-block__contributors")]/p/text()'
        ).re_first('[^,]+')
        loader = FeedEntryItemLoader(
            response=response,
            base_url="https://{}/".format(self.name),
            remove_elems=[
                '.c-product-block__title', '.c-product__product-purchase',
                '.c-track__format-specific-info', '.c-track__duration',
                '.c-track__details', '.c-tracklist__initial-tracks',
                '.c-tabs-block__tabs-links', 'button'
            ],
            replace_elems={'.c-track__all-format-info': _replace_track_info})
        loader.add_value("title", '{} - {}'.format(artist, title))
        loader.add_value("link", response.url)
        loader.add_value("author_name", 'bot')
        loader.add_css("content_html", 'div.c-page--product')
        return loader.load_item()
Beispiel #6
0
    def parse_item(self, response):
        """Build a feed entry from an article page."""
        # Byline of the form "von <name>"; default to editorial shorthand.
        byline = " ".join(response.css(".author-date ::text").extract())
        found = re.search(r"von\s+(.*)", byline)
        author = found.group(1) if found else "Red."

        # Markup stripped from the article body.
        unwanted = [
            "aside", "script", "h1", "source",
            ".breadcrumbs", ".author-date", ".artikel-social-kommentar",
            ".bild-copyright", ".ressortTitleMobile", ".article-number",
            ".artikel-kommentarlink", ".umfrage-wrapper", ".articleIssueInfo",
            "hr",
            "center div[style='padding: 10px; background:#efefef']",
        ]
        loader = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=unwanted,
        )
        loader.add_value("link", response.url)
        loader.add_value("author_name", author)
        loader.add_css("title", 'h1[itemprop="headline"]::text')
        loader.add_value("updated", response.meta["updated"])
        loader.add_css("content_html", "article")
        return loader.load_item()
    def parse_item(self, response):
        """Parse a CMS news item into a feed entry."""
        il = FeedEntryItemLoader(
            response=response,
            base_url='{}/cms/'.format(self._link),
            timezone=self._timezone,
            remove_elems=['.news-latest-date', '.news-single-rightbox', 'hr',
                          'h7'],
            remove_elems_xpath=['//div[@class="news-single-item"]/b[1]',
                                '//div[@class="news-single-item"]/br[1]'],
        )

        # Page titles look like "::: <title>".
        il.add_value(
            'title',
            response.xpath('//head/title/text()').re_first(r'::: (.*)'))

        il.add_value('link', response.url)

        il.add_value(
            'updated',
            response.xpath('//div[@class="news-single-rightbox"]').
            re_first(r'(\d{2}\.\d{2}\.\d{4})'))

        # Escape the dot so "recht.at" is matched literally instead of
        # "recht<any char>at".
        il.add_value(
            'author_name',
            response.xpath('//head/meta[@name="publisher"]/@content').
            re_first(r'recht\.at, (.*);'))
        il.add_xpath('author_name', '//head/meta[@name="author"]/@content')
        # Last-resort fallback: use the site name as author.
        il.add_value('author_name', self.name)

        il.add_xpath('author_email', '//head/meta[@name="reply-to"]/@content')

        il.add_css('content_html', '.news-single-item h7 font strong')
        il.add_css('content_html', '.news-single-item')

        yield il.load_item()
Beispiel #8
0
 def parse_item(self, response):
     """Create a feed entry from an article page."""
     # Strip navigation, social and layout cruft from the article markup.
     cleanup = [
         "aside", "script", "h1", "source",
         ".breadcrumbs", ".author-date", ".artikel-social-kommentar",
         ".bild-copyright", ".ressortTitleMobile", ".article-number",
         ".artikel-kommentarlink", ".umfrage-wrapper", ".articleIssueInfo",
     ]
     loader = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         remove_elems=cleanup,
     )
     loader.add_value("link", response.url)
     # Byline like "(Von) Firstname Lastname"; default to "Red." if absent.
     author = response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)")
     loader.add_value("author_name", author or "Red.")
     loader.add_css("title", 'h1[itemprop="headline"]::text')
     loader.add_value("updated", response.meta["updated"])
     loader.add_css("content_html", "article")
     return loader.load_item()
Beispiel #9
0
    def _parse_article(self, response):
        """Convert an article page into a feed entry, skipping deleted ones."""
        # HTTP 410 Gone: the article was removed.
        if response.status == 410:
            return

        loader = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            base_url="https://www.{}".format(self.name),
            remove_elems=[".bildtext .author", "iframe"],
            change_tags={
                "h1": "h2", ".bildbox": "figure", ".bildtext": "figcaption"
            },
            dayfirst=True,
            yearfirst=False,
        )
        # Presence of the payment box marks paywalled content.
        if response.css(".payment"):
            loader.add_value("category", "paywalled")
        loader.add_css("link", 'link[rel="canonical"]::attr(href)')
        loader.add_css("title", 'meta[property="og:title"]::attr(content)')
        loader.add_css("author_name", ".druckheadline::text", re=r"·\s*(.*)\s*·")
        # Header format example: Mon, 01 Oct 18 13:42:45 +0200
        loader.add_css("updated", 'meta[http-equiv="last-modified"]::attr(content)')
        loader.add_css("content_html", ".druckcontent")
        loader.add_value("path", response.meta["ressort"])
        return loader.load_item()
Beispiel #10
0
 def _parse_article(self, response):
     """Add the article body to the parent item loader and finish the entry."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta['il'],
         remove_elems=['iframe', 'script'],
         base_url='http://{}'.format(self.name),
     )
     loader.add_css('content_html', '.entry-content')
     return loader.load_item()
Beispiel #11
0
 def _parse_article(self, response):
     """Yield the finished feed entry for an article page."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta['il'],
         base_url='http://{}'.format(self.name),
         remove_elems=['#issue', 'h1', '#slogan', '#logo', '#footer'],
     )
     loader.add_css('content_html', '#page')
     yield loader.load_item()
Beispiel #12
0
 def _parse_article(self, response):
     """Return the feed entry built from an article page."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url="https://{}".format(self.name),
         remove_elems=["#issue", "h1", "#slogan", "#logo", "#footer"],
     )
     loader.add_css("content_html", "#page")
     return loader.load_item()
Beispiel #13
0
 def _parse_article(self, response):
     """Finish the parent loader with the article's page content."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url=f"https://{self.name}",
         remove_elems=["#issue", "h1", "#slogan", "#logo", "#footer"],
     )
     loader.add_css("content_html", "#page")
     return loader.load_item()
Beispiel #14
0
 def parse_item(self, response):
     """Build a feed entry for an event page."""
     loader = FeedEntryItemLoader(
         response=response,
         base_url="{}/".format(self.feed_link),
         timezone="Europe/Vienna",
         dayfirst=True,
         remove_elems=[".ruler", "h1"],
     )
     loader.add_css("title", "h1.event-title::text")
     loader.add_value("link", response.url)
     loader.add_css("content_html", "div#content.container")
     return loader.load_item()
Beispiel #15
0
 def parse_item_text(self, response):
     """Add the article text to the parent loader, flagging paywalled posts."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=[".ad-component", ".wp-caption-text"],
         base_url="https://{}".format(self.name),
     )
     # The blue box is only present on paywalled articles.
     if response.css(".bluebox"):
         loader.add_value("category", "paywalled")
     loader.add_css("content_html", "div.pR")
     return loader.load_item()
Beispiel #16
0
 def parse_item(self, response):
     """Turn an event page into a feed entry."""
     entry = FeedEntryItemLoader(
         response=response,
         base_url="{}/".format(self.feed_link),
         timezone="Europe/Vienna",
         dayfirst=True,
         remove_elems=[".ruler", "h1"],
     )
     entry.add_css("title", "h1.event-title::text")
     entry.add_value("link", response.url)
     entry.add_css("content_html", "div#content.container")
     return entry.load_item()
Beispiel #17
0
    def _parse_weekly_edition(self, response):
        """Parse a weekly edition page.

        Spawns one request per article that has its own page, then yields
        the edition itself with those articles stripped from its body so
        they are not duplicated in the feed.
        """
        remove_elems = ["h1"]
        # Demote/promote the edition's CSS-classed headings to real
        # heading levels for the feed output.
        change_tags = {
            ".Cat1HL": "h1",
            ".Cat2HL": "h2",
            ".Cat3HL": "h3",
            ".SummaryHL": "h4",
        }
        il = FeedEntryItemLoader(
            response=response,
            parent=response.meta["il"],
            change_tags=change_tags,
            remove_elems=remove_elems,
            base_url=f"https://{self.name}",
        )

        # Fetch each linked article separately via its summary headline.
        for url in response.css("h2.SummaryHL a::attr(href)").extract():
            yield scrapy.Request(
                response.urljoin(url),
                self._parse_article,
                meta={
                    "il": None,
                    "updated": response.meta["updated"]
                },
            )

        # Remove articles that have their own page.
        # Line-by-line state machine over the raw HTML of .ArticleText:
        # skip everything between an article's summary headline and its
        # comments link.
        text = []
        in_article = False
        for line in response.css(".ArticleText").extract_first().splitlines(
                True):
            # Beginning of article.
            if '<h2 class="SummaryHL"><a href="/Articles/' in line:
                in_article = True
            if not in_article:
                text.append(line)
            # End of article. Note that the links to the comments doesn't
            # always include "#comments" so we can't check for that.
            if '">Comments (' in line:
                in_article = False
        text = "".join(text)

        # Remove page editor.
        text = re.sub(r"<b>Page editor</b>: .*", "", text)

        # Recursively remove headings with no content.
        text = _remove_empty_headings(text)

        il.add_css("title", "h1::text")
        il.add_value("content_html", text)
        il.add_value("link", response.url)
        yield il.load_item()
Beispiel #18
0
    def _parse_article(self, response):
        """Parse an article page; drop the response on bot detection."""
        title = response.css('meta[property="og:title"]::attr(content)').extract_first()
        # A missing og:title means the bot-detection page was served
        # instead of the article.
        if not title:
            raise DropResponse(
                "Skipping {} because ran into bot detection".format(response.url),
                transient=True,
            )

        loader = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=[
                "meta",
                ".ds-share-list",
                ".advert",
                ".layout-article-links",
                ".ds-chapter-list",
                ".layout-article-meta",
            ],
            change_tags={
                ".article__lead-image": "figure",
                ".article__description": "h2",
                ".article__footnote": "i",
            },
        )
        loader.add_value("link", response.url)
        loader.add_value("title", title)
        loader.add_css("updated", "time.article__dateline-datetime::attr('datetime')")
        # Assemble the entry from lead image, description, and body.
        loader.add_css("content_html", ".article__lead-image")
        loader.add_css("content_html", ".article__description")
        loader.add_css("content_html", ".layout-article-body")
        loader.add_value("path", response.meta["ressort"])
        return loader.load_item()
Beispiel #19
0
    def _parse_article(self, response):
        """Yield a feed entry for an article, fixing lazy-loaded image srcs."""

        def _fix_img_src(elem):
            # Lazy-loaded images keep the real URL in "data-original".
            if "data-original" in elem.attrib:
                elem.attrib["src"] = elem.attrib["data-original"]
            return elem

        loader = FeedEntryItemLoader(
            response=response,
            base_url=self._base_url,
            remove_elems=[
                ".credit",
                ".hide-caption",
                ".toggle-caption",
                ".enlarge-options",
                ".enlarge_measure",
                ".enlarge_html",
                ".ad-backstage",
                'p:first-of-type:contains("Editor\'s Note: This is an excerpt of")',
                'p:contains("Did you enjoy this newsletter segment?")',
            ],
            replace_elems={"img": _fix_img_src},
            change_tags={".image": "figure", ".credit-caption": "figcaption"},
        )
        loader.add_css("title", "h1 ::text")
        loader.add_value("link", response.url)
        loader.add_css("content_html", "#storytext")
        loader.add_value("path", response.meta["path"])
        loader.add_css("updated", '.dateblock time::attr("datetime")')
        loader.add_css("author_name", ".byline__name a::text")

        yield loader.load_item()
Beispiel #20
0
    def _parse_weekly_edition(self, response):
        """Parse a weekly edition page.

        Requests each article that has its own page, then yields the
        edition entry with those articles cut out of its body to avoid
        duplicating them in the feed.
        """
        remove_elems = ['h1']
        # Map the edition's CSS-classed headings to real heading levels.
        change_tags = {
            '.Cat1HL': 'h1',
            '.Cat2HL': 'h2',
            '.Cat3HL': 'h3',
            '.SummaryHL': 'h4',
        }
        il = FeedEntryItemLoader(response=response,
                                 parent=response.meta['il'],
                                 change_tags=change_tags,
                                 remove_elems=remove_elems,
                                 base_url='https://{}'.format(self.name))

        # Fetch each linked article separately.
        for url in response.css('h2.SummaryHL a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(url),
                                 self._parse_article,
                                 meta={
                                     'il': None,
                                     'updated': response.meta['updated']
                                 })

        # Remove articles that have their own page.
        # Line-by-line state machine: skip everything between an article's
        # summary headline and its comments link.
        text = []
        in_article = False
        for line in (
                response.css('.ArticleText').extract_first().splitlines(True)):
            # Beginning of article.
            if '<h2 class="SummaryHL"><a href="/Articles/' in line:
                in_article = True
            if not in_article:
                text.append(line)
            # End of article. Note that the links to the comments doesn't
            # always include "#comments" so we can't check for that.
            if '">Comments (' in line:
                in_article = False
        text = ''.join(text)

        # Remove page editor.
        text = re.sub(r'<b>Page editor</b>: .*', '', text)

        # Recursively remove headings with no content.
        text = _remove_empty_headings(text)

        il.add_css('title', 'h1::text')
        il.add_value('content_html', text)
        il.add_value('link', response.url)
        yield il.load_item()
Beispiel #21
0
 def _parse_article(self, response):
     """Fill the parent loader with the article's cleaned body content."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=["script"],
         change_tags={".entry-content-info-box": "blockquote"},
         base_url="https://{}".format(self.name),
         # Inline footnotes into the main text flow.
         convert_footnotes=[".footnoteContent"],
         pullup_elems={".footnoteContent": 1},
     )
     loader.add_css("content_html", ".entry-content")
     return loader.load_item()
Beispiel #22
0
 def _parse_article(self, response):
     """Complete the parent item loader with the article body."""
     footnote_selectors = [".footnoteContent"]
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=["script"],
         change_tags={".entry-content-info-box": "blockquote"},
         base_url="https://{}".format(self.name),
         convert_footnotes=footnote_selectors,
         # Pull footnote content up one level so it lands in the text.
         pullup_elems={".footnoteContent": 1},
     )
     loader.add_css("content_html", ".entry-content")
     return loader.load_item()
Beispiel #23
0
 def parse_letter(self, response):
     """Yield a feed entry for a single letter page of *account*."""
     account = response.meta["account"]
     loader = FeedEntryItemLoader(
         response=response, base_url=self._links.get(account)
     )
     loader.add_value("path", account)
     loader.add_value("link", response.url)
     loader.add_css("title", "title::text")
     loader.add_css("author_name", "div#message-heading div.by-line a::text")
     loader.add_css("updated", "div#message-heading div.date::text")
     loader.add_css("content_html", "div.message-body")
     yield loader.load_item()
Beispiel #24
0
 def _parse_item(self, response):
     """Parse a HELP.gv.at topic page into a feed entry."""
     remove_elems = [
         "h1",
         ".nono",
         ".acceptance_org",
         ".state",
         "script",
         ".gentics-portletreload-position-notvisibleposition",
     ]
     # Drop outdated-content advice boxes and the in-page table of
     # contents (links starting with "#").
     remove_elems_xpath = [
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     change_tags = {"abbr": "span"}
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Vienna",
         base_url="https://www.{}".format(self.name),
         remove_elems=remove_elems,
         remove_elems_xpath=remove_elems_xpath,
         change_tags=change_tags,
         dayfirst=True,
     )
     il.add_value("link", response.url)
     # Organization name appears after a <br> inside .acceptance_org.
     il.add_xpath(
         "author_name",
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
     # Last-updated date (dd.mm.yyyy) lives in the .state element.
     il.add_value(
         "updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")
     )
     il.add_css("content_html", ".Content")
     return il.load_item()
Beispiel #25
0
 def _parse_episode(self, response):
     """Yield a feed entry for a single episode page.

     The page's <meta name="title"> encodes show name and broadcast
     date as "<show> vom <dd.mm.yyyy> - puls4.com".
     """
     il = FeedEntryItemLoader(response=response,
                              base_url='http://{}'.format(self.name),
                              timezone=self._timezone,
                              dayfirst=True)
     il.add_value('link', response.url)
     # Raw string: a non-raw "\." is an invalid escape sequence
     # (SyntaxWarning on modern Python).
     il.add_xpath('title', '//meta[@name="title"]/@content',
                  re=r'(?s)(.*?)(?: vom .*)? - puls4\.com')
     # Combine broadcast date from the title with the time from request
     # meta, defaulting to midnight.
     il.add_value('updated', '{} {}'.format(
         response.xpath('//meta[@name="title"]/@content').
         re_first(r'.*vom (\d{2}\.\d{2}\.\d{4}).*'),
         response.meta['time'] or '00:00')
     )
     il.add_value('content_html', '<img src="{}">'.format(
         response.xpath('//meta[@property="og:image"]/@content').
         extract_first()))
     il.add_css('content_html', '.player-video-description-intro::text')
     yield il.load_item()
Beispiel #26
0
    def parse_item(self, response):
        """Parse an article page, filtering by the configured sections."""
        if response.status == 404:
            self.logger.info("Article '{}' not available anymore.".format(
                response.url))
            return

        def _clean_caption(elem):
            # Strip the photo credit, keeping only the caption text.
            if "–" in elem.text:
                # Caption is of the format "text - credit".
                elem.text = re.sub(r"\s*([^–]*).*", r"\1", elem.text)
                return elem
            else:
                # It's just the "credit", remove it.
                return None

        section = response.css('meta[name="kt:section-path"]::attr("content")'
                               ).extract_first()[1:]  # Skip the first /.
        if section not in self._sections and "all" not in self._sections:
            # Ignore the response as the ressort should not be parsed.
            return

        il = FeedEntryItemLoader(
            response=response,
            parent=response.meta["il"],
            remove_elems=[
                ".ad",
                ".article-paid",
                ".js-overlay-close",
                ".swiper-lazy-preloader",
            ],
            change_tags={".article__lead": "strong"},
            # Promote the zoomed image's lazy "data-src" to "src".
            change_attribs={".zoomable__image--zoomed": {
                "data-src": "src"
            }},
            replace_elems={".article__media-caption": _clean_caption},
            base_url="https://www.{}".format(self.name),
        )
        # Byline may start with "Von"/"von"; capture the rest as the name.
        il.add_css(
            "author_name",
            "article .article__author ::text",
            re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
        )
        il.add_css("content_html", "article .article__media .zoomable__inner")
        il.add_css("content_html",
                   "article .article__lead")  # change tags to strong
        il.add_css("content_html", "article .article__body")
        if response.css(".article-paid"):
            il.add_value("category", "paywalled")
        il.add_value("category", section.split("/"))
        if "all" in self._sections:
            il.add_value("path", "all")
        if section in self._sections:
            il.add_value("path", section)
        return il.load_item()
Beispiel #27
0
 def _parse_item(self, response):
     """Parse a HELP.gv.at topic page into a feed entry."""
     remove_elems = [
         "h1",
         ".nono",
         ".acceptance_org",
         ".state",
         "script",
         ".gentics-portletreload-position-notvisibleposition",
     ]
     # Drop outdated-content advice boxes and the in-page table of
     # contents (links starting with "#").
     remove_elems_xpath = [
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     change_tags = {"abbr": "span"}
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Vienna",
         base_url="https://www.{}".format(self.name),
         remove_elems=remove_elems,
         remove_elems_xpath=remove_elems_xpath,
         change_tags=change_tags,
         dayfirst=True,
     )
     il.add_value("link", response.url)
     # Organization name appears after a <br> inside .acceptance_org.
     il.add_xpath(
         "author_name",
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
     # Last-updated date (dd.mm.yyyy) lives in the .state element.
     il.add_value("updated",
                  response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})"))
     il.add_css("content_html", ".Content")
     return il.load_item()
 def _parse_article(self, response):
     """Build a feed entry for a documentation article page."""
     remove_elems = ["h1", "#contents", ".headerlink"]
     change_tags = {".admonition-title": "h2"}
     il = FeedEntryItemLoader(
         response=response,
         base_url=response.url,
         remove_elems=remove_elems,
         change_tags=change_tags,
     )
     il.add_value("link", response.url)
     il.add_value("author_name", "Brandon Rhodes")
     # Use "Last-Modified" field or fall back to "Date". Guard against
     # both headers being absent so we don't call .decode() on None.
     updated = response.headers.get("Last-Modified", response.headers.get("Date"))
     if updated is not None:
         il.add_value("updated", updated.decode("ascii"))
     il.add_css("title", "title::text")
     il.add_css("content_html", ".section")
     return il.load_item()
Beispiel #29
0
 def _parse_article(self, response):
     """Parse an article page into a feed entry.

     For paywalled content a follow-up FormRequest is returned that
     fetches the subscriber link instead of the entry itself.
     """
     remove_elems = [
         ".FeatureByline",
         ".GAByline",
         ".Form",
         "form",
         ".MakeALink",
         "br",
     ]
     change_tags = {"div.BigQuote": "blockquote"}
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         change_tags=change_tags,
         base_url=f"https://{self.name}",
     )
     text = response.css(".ArticleText").extract_first()
     # Remove 'Log in to post comments'.
     text = re.sub(r'<hr width="60%" align="left">.*to post comments\)',
                   "",
                   text,
                   flags=re.S)
     il.add_css("title", "h1::text")
     il.add_value("content_html", text)
     # Author: try the feature byline, then the GA byline, then the
     # "contributed by" phrasing.
     il.add_css("author_name", ".FeatureByline b ::text")
     il.add_css("author_name", ".GAByline a ::text")
     il.add_css(
         "author_name",
         ".GAByline p ::text",
         re="This article was contributed by (.*)",
     )
     # Date: text after <br> in the feature byline, or the first GA
     # byline paragraph.
     il.add_xpath(
         "updated",
         '//div[@class="FeatureByline"]/text()[preceding-sibling::br]',
         TakeFirst(),
     )
     il.add_xpath("updated", '//div[@class="GAByline"]/p[1]/text()')
     # Last resort if date cannot be extracted and it's a weekly edition.
     if "updated" in response.meta:
         il.add_value("updated", response.meta["updated"])
     if response.css(".MakeALink"):
         # Get subscriber link for paywalled content.
         return scrapy.FormRequest.from_response(
             response,
             formcss=".MakeALink form",
             callback=self._subscriber_link,
             meta={"il": il},
         )
     else:
         il.add_value("link", response.url)
         return il.load_item()
Beispiel #30
0
    def parse_item(self, response):
        """Parse a CMS news item into a feed entry."""
        il = FeedEntryItemLoader(
            response=response,
            base_url="{}/cms/".format(self.feed_link),
            timezone="Europe/Vienna",
            remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"],
            remove_elems_xpath=[
                '//div[@class="news-single-item"]/b[1]',
                '//div[@class="news-single-item"]/br[1]',
            ],
            dayfirst=True,
        )

        # Page titles look like "::: <title>".
        il.add_value(
            "title", response.xpath("//head/title/text()").re_first(r"::: (.*)")
        )

        il.add_value("link", response.url)

        il.add_value(
            "updated",
            response.xpath('//div[@class="news-single-rightbox"]').re_first(
                r"(\d{2}\.\d{2}\.\d{4})"
            ),
        )

        # Escape the dot so "recht.at" is matched literally instead of
        # "recht<any char>at".
        il.add_value(
            "author_name",
            response.xpath('//head/meta[@name="publisher"]/@content').re_first(
                r"recht\.at, (.*);"
            ),
        )
        il.add_xpath("author_name", '//head/meta[@name="author"]/@content')
        # Last-resort fallback: use the site name as author.
        il.add_value("author_name", self.name)

        il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')

        il.add_css("content_html", ".news-single-item h7 font strong")
        il.add_css("content_html", ".news-single-item")

        return il.load_item()
    def parse_item(self, response):
        """Parse a CMS news item into a feed entry."""
        il = FeedEntryItemLoader(
            response=response,
            base_url="{}/cms/".format(self.feed_link),
            timezone="Europe/Vienna",
            remove_elems=[
                ".news-latest-date", ".news-single-rightbox", "hr", "h7"
            ],
            remove_elems_xpath=[
                '//div[@class="news-single-item"]/b[1]',
                '//div[@class="news-single-item"]/br[1]',
            ],
            dayfirst=True,
        )

        # Page titles look like "::: <title>".
        il.add_value(
            "title",
            response.xpath("//head/title/text()").re_first(r"::: (.*)"))

        il.add_value("link", response.url)

        il.add_value(
            "updated",
            response.xpath('//div[@class="news-single-rightbox"]').re_first(
                r"(\d{2}\.\d{2}\.\d{4})"),
        )

        # Escape the dot so "recht.at" is matched literally instead of
        # "recht<any char>at".
        il.add_value(
            "author_name",
            response.xpath('//head/meta[@name="publisher"]/@content').re_first(
                r"recht\.at, (.*);"),
        )
        il.add_xpath("author_name", '//head/meta[@name="author"]/@content')
        # Last-resort fallback: use the site name as author.
        il.add_value("author_name", self.name)

        il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')

        il.add_css("content_html", ".news-single-item h7 font strong")
        il.add_css("content_html", ".news-single-item")

        return il.load_item()
Beispiel #32
0
    def parse_article(self, response):
        """Parse a single article page and yield a feed entry for it."""
        site = response.meta["site"]
        # Drop the duplicated headline, the date paragraph and the page
        # header; the feed carries this metadata separately.
        boilerplate = [
            "div.main-content h1:first-of-type",
            "p#ctl00_ctl00_ctl00_cph_col_a_cph_content_cph_content_detail_p_date",
            "div#main-content-header",
        ]
        loader = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            base_url=self._links[site],
            dayfirst=True,
            yearfirst=False,
            remove_elems=boilerplate,
        )
        loader.add_value("path", site)
        loader.add_value("link", response.url)
        loader.add_value("updated", response.meta["updated"])
        loader.add_css("title", "div.main-content h1:first-of-type::text")
        loader.add_css("content_html", "div.main-content")

        yield loader.load_item()
Beispiel #33
0
 def _parse_article(self, response):
     """Parse one page of a (possibly multi-page) article.

     The item loader is chained through ``response.meta["il"]`` so that the
     content of all pages ends up in one feed entry.  As long as a ".next"
     element exists, the last pagination link is followed; otherwise the
     accumulated item is returned.
     """
     # Strip credits, share widgets and table-of-contents boilerplate.
     remove_elems = [
         ".caption-credit",
         ".gallery-image-credit",
         "#social-left",
         "ul.toc",
         "h3:contains('Table of Contents')",
         "br",
         ".sidebar:contains('Further Reading')",
         ".credit",
     ]
     change_tags = {".sidebar": "blockquote", "aside": "blockquote"}
     # Convert lazy-loading image containers into plain <img> tags.
     replace_elems = {"div.image": self._div_to_img}
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         replace_elems=replace_elems,
         change_tags=change_tags,
     )
     # Link, author, intro and path are added only once, on the first page.
     if response.meta.get("first_page", False):
         il.add_value("link", response.url)
         il.add_css("author_name", ".byline a span ::text")
         il.add_css("content_html", "header h2")
         il.add_value("path", response.meta["path"])
     il.add_css("content_html", ".article-content")
     if response.css(".next"):
         # Follow the last page-number link, i.e. the next page.
         return scrapy.Request(
             response.css(".numbers a::attr(href)").extract()[-1],
             self._parse_article,
             meta={"il": il, "path": response.meta["path"]},
         )
     else:
         return il.load_item()
Beispiel #34
0
    def _parse_video_page(self, response):
        """Extract metadata from a video page and request its HLS manifest."""
        id_match = re.search(
            r"https?://(?:www\.)?servustv\.com/videos/(?P<id>[aA]{2}-\w+|\d+-\d+)",
            response.url,
        )
        if id_match is None:
            # Not a video URL we recognize; nothing to emit.
            return
        video_id = id_match.group("id").upper()

        il = FeedEntryItemLoader(response=response)
        il.add_value("link", response.url)
        section = response.css(
            "meta[property='article:section']::attr('content')"
        ).extract_first()
        # "Allgemein" is a catch-all section and adds no information.
        if section != "Allgemein":
            il.add_value("title", section)
        il.add_css("title", "title::text", re="(.*) - Servus TV")
        image_url = response.css(
            "meta[property='og:image']::attr('content')"
        ).extract_first()
        il.add_value("content_html", '<img src="{}">'.format(image_url))
        il.add_css(
            "content_html", "meta[property='og:description']::attr('content')"
        )
        il.add_css("content_html", "#media-asset-content-container")

        # The modification date is embedded in an inline JSON blob.
        date_match = re.search(r'"dateModified":\s*"([^"]+)"', response.text)
        if date_match:
            il.add_value("updated", date_match.group(1))

        stream_url = "https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8" % video_id

        yield Request(stream_url, self._parse_stream, meta={"il": il})
Beispiel #35
0
 def _parse_item(self, response):
     """Parse a HELP.gv.at topic page into a feed entry."""
     # Strip the headline, navigation helpers, state box and scripts.
     remove_elems = [
         'h1', '.nono', '.acceptance_org', '.state', 'script',
         '.gentics-portletreload-position-notvisibleposition'
     ]
     # Drop outdated-content advice boxes and the table of contents.
     remove_elems_xpath = [
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     change_tags = {
         'abbr': 'span',
     }
     il = FeedEntryItemLoader(response=response,
                              timezone=self._timezone,
                              base_url='https://www.{}'.format(self.name),
                              remove_elems=remove_elems,
                              remove_elems_xpath=remove_elems_xpath,
                              change_tags=change_tags,
                              dayfirst=True)
     il.add_value('link', response.url)
     # The responsible organization doubles as the author.
     il.add_xpath(
         'author_name',
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css('title', 'title::text', re=r'HELP.gv.at:\s*(.*)')
     # Last-updated date (dd.mm.yyyy) is shown in the ".state" box.
     il.add_value('updated',
                  response.css('.state').re_first(r'(\d{2}\.\d{2}\.\d{4})'))
     il.add_css('content_html', '.Content')
     yield il.load_item()
Beispiel #36
0
    def _parse_article_url(self, response):
        """Follow an article, preferring the print version when linked.

        Error pages are skipped.  If no print version exists, the article is
        treated as paywalled and the teaser content is used directly.
        """
        # extract_first() returns None on unexpected markup; treat that like
        # a page without an error heading instead of raising TypeError.
        heading = response.css('h2 ::text').extract_first() or ''
        if 'Fehler' in heading:
            self.logger.info('Skipping {} as it returned an error'.format(
                response.url))
            return

        remove_elems = ['div[style="padding-top:10px;"]']
        il = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 base_url='http://{}'.format(self.name),
                                 dayfirst=True,
                                 remove_elems=remove_elems)
        il.add_value('link', response.url)
        il.add_value('author_name', 'VKI')
        # Publication date, e.g. "veröffentlicht: 01.02.2019".
        date = response.css('.issue').re_first(
            r'veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})')
        il.add_value('updated', date)
        # The print version is only reachable through an onclick handler.
        url = (response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
            r"window\.open\('(.*)'\);"))
        il.add_css('title', 'h1::text')
        if url:
            yield scrapy.Request(response.urljoin(url),
                                 callback=self._parse_article,
                                 meta={'il': il})
        else:
            il.add_value('category', 'paywalled')
            il.add_css('content_html', '.primary')
            il.add_css('content_html', 'div[style="padding-top:10px;"] > h3')
            yield il.load_item()
Beispiel #37
0
 def parse_item(self, response):
     """Turn an article page into a feed entry."""
     # Elements that must not end up in the feed content.
     boilerplate = [
         'aside',
         'script',
         'h1',
         '.breadcrumbs',
         '.author-date',
         '.artikel-social-kommentar',
         '.bild-copyright',
         '.ressortTitleMobile',
         '.article-number',
         '.artikel-kommentarlink',
         '.umfrage-wrapper',
         '.articleIssueInfo',
     ]
     loader = FeedEntryItemLoader(response=response,
                                  timezone=self._timezone,
                                  base_url='http://{}'.format(self.name),
                                  remove_elems=boilerplate)
     loader.add_value('link', response.url)
     # Fall back to the generic "Red." byline when no author is found.
     author_matches = response.css('.author-date ::text').re(
         r'(?:Von)?\s*(\w+ \w+)')
     loader.add_value('author_name', author_matches or 'Red.')
     loader.add_css('title', 'h1[itemprop="headline"]::text')
     # Keep only the part before the timezone offset ("+...").
     loader.add_css('updated',
                    'meta[property="article:published_time"]::attr(content)',
                    re='([^+]*)')
     loader.add_css('content_html', 'article')
     yield loader.load_item()
Beispiel #38
0
 def parse_login_issue(self, response):
     """Parse a ;login: issue page into a feed entry."""
     # Strip file-access notes, product/price boxes and the view header.
     boilerplate = [
         '.field-name-field-file-access',
         '.field-name-field-login-issue-file',
         '.field-name-field-product',
         '.field-commerce-price',
         '.views-field-field-file-access',
         '.view-header',
     ]
     loader = FeedEntryItemLoader(response=response,
                                  base_url='https://www.{}'.format(self.name),
                                  remove_elems=boilerplate,
                                  dayfirst=True)
     loader.add_value('link', response.url)
     title = response.css('h1::text').extract_first().strip()
     loader.add_value('title', title)
     # The issue date is encoded in the title itself.
     loader.add_value('updated', self._date_from_title(title))
     loader.add_css('content_html', '.content-wrapper')
     loader.add_value('path', 'login')
     # Issues behind the membership wall get tagged as paywalled.
     if response.css('.usenix-files-protected'):
         loader.add_value('category', 'paywalled')
     yield loader.load_item()
Beispiel #39
0
    def parse_item(self, response):
        """Build a feed entry for a comic or blog post."""
        loader = FeedEntryItemLoader(response=response, base_url=self._base_url)
        loader.add_value("updated", response.meta["updated"])
        loader.add_value("author_name", response.meta["author_name"])
        loader.add_value("link", response.url)
        loader.add_css("title", "title::text", re="(.*) - The Oatmeal")
        # The first path segment (e.g. "comics" or "blog") is the category.
        first_segment = urlsplit(response.url).path.strip("/").split("/")[0]
        loader.add_value("category", first_segment)

        # Comic pages keep their images directly below #comic ...
        loader.add_css("content_html", "#comic > img")
        loader.add_css("content_html", "#comic > p > img")

        # ... while blog posts embed them in the centered text block.
        loader.add_css("content_html", "#blog .center_text img")
        return loader.load_item()
Beispiel #40
0
    def parse(self, response):
        """Parse a result page.

        Yields one feed entry per item, a feed header on the first page, and
        a request for the next page while within the configured page limit.
        """
        if len(response.css(".thumbnail")) == 0:
            self.logger.info("No items found.")
            return

        for item in response.css(".thumbnail"):
            il = FeedEntryItemLoader(selector=item, base_url=self._base_url)
            il.add_css("title", ".item_brand_text ::text")
            il.add_css("title", ".item-title ::text")
            il.add_css("title", ".current-price ::text")
            il.add_value(
                "link",
                response.urljoin(item.css(".item-link::attr(href)").extract_first()),
            )
            # The image URL is embedded in an inline CSS background.
            image_url = item.css(".item-image::attr(data-bg)").re_first(
                r"url\(([^)]+)\)"
            )
            # Guard against items without a data-bg attribute: re_first()
            # returns None there, which used to crash on startswith().
            if image_url:
                # Fix broken images that have the shop host prepended twice.
                if image_url.startswith(
                    "https://markenankauf.momox.de/pics/https://"
                ):
                    image_url = image_url.replace(
                        "https://markenankauf.momox.de/pics/https://", "https://"
                    )
                il.add_value("content_html", '<img src="{}">'.format(image_url))
            il.add_css("content_html", ".item-des-container")
            il.add_value("path", response.meta["path"])
            yield il.load_item()

        page = int(response.css(".pagination .active a::text").extract_first())
        if page == 1:
            # Emit the feed header exactly once, on the first page.
            yield generate_feed_header(
                title=response.css("title ::text").re_first(
                    "(ubup | .*) Second Hand kaufen"
                ),
                subtitle="Deutschlands größter Second Hand-Onlineshop für "
                "Mode & Accessoires",
                icon="https://www.{}/images/favicon.ico".format(self.name),
                link=response.url,
                path=response.meta["path"],
            )
        if page < self._scrape_pages:
            next_page = response.css(
                ".pagination .active + li a::attr(href)"
            ).extract_first()
            if next_page:
                yield scrapy.Request(
                    response.urljoin(next_page),
                    meta={"dont_cache": True, "path": response.meta["path"]},
                )
Beispiel #41
0
 def _parse_restaurant(self, response):
     """Enrich the parent feed entry with the restaurant detail page."""
     loader = FeedEntryItemLoader(
         response=response,
         base_url=response.url,
         parent=response.meta["il"],
         # External-link markers are noise in the feed.
         remove_elems=[".external"],
     )
     loader.add_css("content_html", ".content .right p")
     loader.add_css("content_html", ".restaurant-link")
     loader.add_css("category", ".tags a ::text")
     yield loader.load_item()
Beispiel #42
0
    def parse_item(self, response):
        """Parse a single article page into a feed entry.

        Returns nothing for deleted articles (HTTP 404) and for articles
        whose section is not subscribed to via ``self._sections``.
        """
        if response.status == 404:
            self.logger.info("Article '{}' not available anymore.".format(response.url))
            return

        def _clean_caption(elem):
            # Captions mix description and photo credit, separated by an
            # en dash ("–").
            if "–" in elem.text:
                # Caption is of the format "text - credit".
                elem.text = re.sub(r"\s*([^–]*).*", r"\1", elem.text)
                return elem
            else:
                # It's just the "credit", remove it.
                return None

        # Section path from page metadata, e.g. "politik/innenpolitik".
        section = response.css(
            'meta[name="kt:section-path"]::attr("content")'
        ).extract_first()[
            1:
        ]  # Skip the first /.
        if section not in self._sections and "all" not in self._sections:
            # Ignore the response as the ressort should not be parsed.
            return

        il = FeedEntryItemLoader(
            response=response,
            parent=response.meta["il"],
            remove_elems=[
                ".ad",
                ".article-paid",
                ".js-overlay-close",
                ".swiper-lazy-preloader",
            ],
            change_tags={".article__lead": "strong"},
            change_attribs={".zoomable__image--zoomed": {"data-src": "src"}},
            replace_elems={".article__media-caption": _clean_caption},
            base_url="https://www.{}".format(self.name),
        )
        # Byline may be prefixed with "Von"/"von"; capture just the name.
        il.add_css(
            "author_name",
            "article .article__author ::text",
            re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
        )
        il.add_css("content_html", "article .article__media .zoomable__inner")
        il.add_css("content_html", "article .article__lead")  # change tags to strong
        il.add_css("content_html", "article .article__body")
        if response.css(".article-paid"):
            il.add_value("category", "paywalled")
        il.add_value("category", section.split("/"))
        if "all" in self._sections:
            il.add_value("path", "all")
        if section in self._sections:
            il.add_value("path", section)
        return il.load_item()
Beispiel #43
0
 def _parse_user_profile(self, response):
     """Cache the user's display name and yield one entry per posting."""
     user_id = response.meta["user_id"]
     display_name = response.css("#up_user h2::text").extract_first().strip()
     self._users[user_id] = display_name
     for posting in response.css(".posting"):
         loader = FeedEntryItemLoader(
             selector=posting,
             base_url="https://{}".format(self.name),
             change_tags={"span": "p"},
         )
         loader.add_css("title", ".text strong::text")
         loader.add_css("link", '.text a::attr("href")')
         # The timestamp attribute is in milliseconds since the epoch.
         millis = int(
             posting.css('.date::attr("data-timestamp")').extract_first()
         )
         loader.add_value("updated", datetime.utcfromtimestamp(millis / 1000))
         loader.add_css("content_html", ".text span")
         loader.add_css("content_html", ".article h4")
         loader.add_value("path", response.meta["path"])
         yield loader.load_item()
Beispiel #44
0
 def _parse_interview(self, response):
     """Extract an interview article into a feed entry."""
     remove_elems = [
         ".shareable-quote",
         ".share-bar",
         # Remove the last two h2s and all paragraphs below.
         # NOTE(review): the two selectors appear twice on purpose —
         # presumably removals are applied sequentially, so once the last
         # h2 is gone, ":last-of-type" matches the previous one; confirm
         # against the remove_elems semantics of FeedEntryItemLoader.
         ".interview-body > h2:last-of-type ~ p",
         ".interview-body > h2:last-of-type",
         ".interview-body > h2:last-of-type ~ p",
         ".interview-body > h2:last-of-type",
     ]
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         remove_elems=remove_elems,
     )
     il.add_value("link", response.url)
     il.add_css("title", "h1::text")
     il.add_css("author_name", "header .user-link__name::text")
     il.add_css("content_html", ".interview-body")
     # The article date is handed over from the overview page via meta.
     il.add_value("updated", response.meta["updated"])
     return il.load_item()
Beispiel #45
0
    def _parse_article_url(self, response):
        """Parse an article page, preferring the print version when linked.

        Paywalled articles (no print version) fall back to the teaser
        content.

        Raises:
            DropResponse: if the page is empty or shows an error message.
        """
        if not response.css("#content"):
            raise DropResponse(
                "Skipping {} since it is empty".format(response.url), transient=True
            )

        # extract_first() returns None on unexpected markup; treat that like
        # a page without an error heading instead of raising TypeError.
        heading = response.css("h2 ::text").extract_first() or ""
        if "Fehler" in heading:
            raise DropResponse(
                "Skipping {} since it returned an error".format(response.url),
                transient=True,
            )

        remove_elems = ['div[style="padding-top:10px;"]']
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            base_url="https://{}".format(self.name),
            dayfirst=True,
            remove_elems=remove_elems,
        )
        il.add_value("link", response.url)
        il.add_value("author_name", "VKI")
        # Publication date, e.g. "veröffentlicht: 01.02.2019".
        date = response.css(".issue").re_first(
            r"veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})"
        )
        il.add_value("updated", date)
        # The print version is only reachable through an onclick handler.
        url = response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
            r"window\.open\('(.*)'\);"
        )
        il.add_css("title", "h1::text")
        if url:
            return scrapy.Request(
                response.urljoin(url), callback=self._parse_article, meta={"il": il}
            )
        else:
            il.add_value("category", "paywalled")
            il.add_css("content_html", ".primary")
            il.add_css("content_html", 'div[style="padding-top:10px;"] > h3')
            return il.load_item()
Beispiel #46
0
    def _parse_article(self, response):
        """Parse an article, inlining its audio and video media.

        Implemented as an inline-requests generator: media API requests are
        yielded and their responses are received back through ``yield``.
        Emits the article item and, when it contains audio, a second copy
        for the podcast feed.
        """
        def _inline_video(videos, elem):
            # Inline a <source> tag when the element references a known
            # video id; header videos are replaced by a placeholder image.
            if "data-video-id" in elem.attrib:
                source = lxml.etree.Element("source")
                source.attrib["src"] = videos[elem.attrib["data-video-id"]]
                source.attrib["type"] = "video/mp4"
                elem.insert(0, source)
                return elem
            else:
                # Header video, replace with placeholder image.
                parent = elem.getparent()
                parent.tag = "figure"
                if "data-placeholderbig" in elem.attrib:
                    src = elem.attrib["data-placeholderbig"]
                else:
                    src = elem.attrib["data-placeholder"]
                image = lxml.etree.Element("img")
                image.attrib["src"] = src
                return image

        def _inline_picture(elem):
            # Pick the largest responsive variant (highest data-min-width)
            # and turn the wrapper element into a plain <img>.
            elem.tag = "img"
            src = elem.attrib.get("data-original")
            data_min_width = 1000 if src else -1
            for child in elem.getchildren():
                if child.tag != "span":
                    continue
                if int(child.attrib.get("data-min-width", 0)) > data_min_width:
                    src = child.attrib["data-src"]
                    data_min_width = int(child.attrib.get("data-min-width", 0))
                child.drop_tree()
            elem.attrib["src"] = src
            return elem

        audio_ids = response.css(
            '#BCaudioPlayer_eindeutig::attr("data-video-id")'
        ).extract()
        video_ids = response.css('.video-js::attr("data-video-id")').extract()
        media = {}
        # Resolve every media id to the URL of its largest MP4 rendition.
        for media_id in audio_ids + video_ids:
            api_response = yield self._build_api_request(media_id)
            api_response = json.loads(api_response.text)
            media[media_id] = sorted(
                (
                    video
                    for video in api_response["sources"]
                    if "src" in video and video.get("container") == "MP4"
                ),
                key=lambda v: v["size"],
            )[-1]["src"]

        # Navigation, share widgets and other page chrome to strip.
        remove_elems = [
            "h1",
            "script",
            "style",
            ".projectNav",
            ".socialShare",
            ".socialShare__headline",
            ".socialShare__icon",
            ".socialMedia",
            ".socialMedia__headline",
            ".whyRead",
            ".overlayCTA",
            ".authors",
            ".sectionBackground--colorTheme1",
            ".heroStage__copyright",
            ".heroStage__downLink",
            ".callToAction",
            ".print-action",
            ".internalLink span",
            ".addCommunity",
            ".download",
            ".BCaudioPlayer",
            ".icon-date",
            ".callToAction__button",
            'a[href^="http://partners.webmasterplan.com/click.asp"]',
            ".relatedSlider",
            ".imageLightbox",
            ".image__copyrightWrapper",
            ".image__zoom",
            ".image > .picture",
            ".imageHC",
        ]
        change_tags = {
            "div.heroStage__introText": "strong",
            ".quote": "blockquote",
            ".quote__label": "footer",
            ".supernumber": "blockquote",
            ".image": "figure",
            ".image__element": "div",
        }
        replace_elems = {
            "video": partial(_inline_video, media),
            ".picture": _inline_picture,
        }
        pullup_elems = {".image__content figcaption": 3}
        il = FeedEntryItemLoader(
            response=response,
            base_url=response.url,
            remove_elems=remove_elems,
            change_tags=change_tags,
            replace_elems=replace_elems,
            pullup_elems=pullup_elems,
        )
        il.add_value("link", response.url)
        il.add_css("author_name", ".sidebar .authors__name::text")
        il.add_css("title", "title::text", re="(.*) - Addendum")
        il.add_css("updated", 'meta[property="article:modified_time"]::attr(content)')
        # If not yet modified:
        il.add_css("updated", 'meta[property="article:published_time"]::attr(content)')
        il.add_css("content_html", ".content")
        # Videos become enclosures; audio is reserved for the podcast feed.
        for medium_id, medium_url in media.items():
            if medium_id not in audio_ids:
                il.add_value("enclosure", {"iri": medium_url, "type": "video/mp4"})
        item = il.load_item()
        # Save a copy before yielding it.
        item_podcast = deepcopy(item)
        yield item

        if audio_ids:
            # Export to podcast feed.
            il = FeedEntryItemLoader(item=item_podcast)
            il.add_value("path", "podcast")
            for medium_id, medium_url in media.items():
                if medium_id in audio_ids:
                    il.add_value("enclosure", {"iri": medium_url, "type": "audio/mp4"})
            yield il.load_item()
Beispiel #47
0
    def _parse_article(self, response):
        """Parse an ORF.at article into a feed entry.

        Implemented as an inline-requests generator: teaser targets and
        slideshow JSON documents are fetched by yielding requests.
        """
        # Heuristic for news.ORF.at to detect teaser articles.
        more = self._extract_link(
            response.css(
                ".story-story p > strong:contains('Mehr') + a::attr(href), "
                + ".story-story p > a:contains('Lesen Sie mehr')::attr(href)"
            ).extract_first()
        )
        if more and more != response.url:
            self.logger.debug("Detected teaser article, redirecting to {}".format(more))
            response = yield scrapy.Request(more, meta=response.meta)

        # Bylines, share widgets and other chrome to strip from the body.
        remove_elems = [
            ".byline",
            "h1",
            ".socialshare",
            ".socialShareWrapper",
            ".socialButtons",
            ".credit",
            ".toplink",
            ".offscreen",
            ".storyMeta",
            "script",
            ".oon-youtube-logo",
            ".vote",
            # redesign
            "#more-to-read-anchor",
            ".social-buttons",
            ".story-horizontal-ad",
            ".linkcard",
        ]
        pullup_elems = {
            ".remote .slideshow": 1,
            ".remote .instagram": 1,
            ".remote .facebook": 1,
            ".remote .twitter": 1,
            ".remote .youtube": 1,
            ".remote table": 1,
        }
        replace_elems = {
            ".video": "<p><em>Hinweis: Das eingebettete Video ist nur im Artikel "
            + "verfügbar.</em></p>"
        }
        change_attribs = {"img": {"data-src": "src", "srcset": "src"}}
        change_tags = {
            ".image": "figure",
            ".caption": "figcaption",
            ".fact": "blockquote",  # FM4
        }
        author, author_selector = self._extract_author(response)
        if author:
            self.logger.debug("Extracted possible author '{}'".format(author))
            # Remove the paragraph that contains the author.
            remove_elems.insert(0, author_selector)
        else:
            self.logger.debug("Could not extract author name")
            author = "{}.ORF.at".format(response.meta["path"])

        # Fetch each slideshow's JSON and replace the placeholder element
        # with rendered HTML.
        for slideshow in response.css(".slideshow"):
            link = response.urljoin(
                slideshow.css('::attr("data-slideshow-json-href")').extract_first()
            ).replace("jsonp", "json")
            slideshow_id = slideshow.css('::attr("id")').extract_first()
            slideshow_response = yield scrapy.Request(link)
            replace_elems["#{}".format(slideshow_id)] = self._create_slideshow_html(
                slideshow_response
            )

        il = FeedEntryItemLoader(
            response=response,
            remove_elems=remove_elems,
            pullup_elems=pullup_elems,
            replace_elems=replace_elems,
            change_attribs=change_attribs,
            change_tags=change_tags,
        )

        # The field is part of a JSON that is sometimes not valid, so don't bother with
        # parsing it properly.
        match = re.search(r'"datePublished": "([^"]+)"', response.text)
        if match:
            # news.ORF.at
            updated = match.group(1)
        else:
            # other
            updated = response.meta["updated"]
        il.add_value("updated", updated)
        il.add_css("title", "title::text", re=re.compile(r"(.*) - .*", flags=re.S))
        il.add_value("link", response.url)
        il.add_css("content_html", ".opener img")  # FM4, news
        il.add_css("content_html", ".story-lead-text")  # news
        il.add_css("content_html", "#ss-storyText")
        il.add_css("content_html", "#ss-storyContent")  # news
        il.add_value("author_name", author)
        # Known authors get their own feed path in addition to the ressort.
        if author in self._authors:
            il.add_value("path", author)
        il.add_value("path", response.meta["path"])
        il.add_value("category", response.meta["categories"])
        yield il.load_item()
Beispiel #48
0
    def _parse_article(self, response):
        """Parse a derstandard.at article (regular, blog or feature page)."""
        def _fix_img_src(elem):
            # Prefer the high-resolution zoom image when it is hosted on
            # images.derstandard.at; drop presentation-only attributes.
            src = elem.attrib.pop("data-zoom-src", None)
            # data-zoom-src is only valid if it starts with //images.derstandard.at.
            if src and src.startswith("//images.derstandard.at"):
                elem.attrib["src"] = src
            elem.attrib.pop("width", None)
            elem.attrib.pop("height", None)
            elem.attrib.pop("class", None)
            return elem

        remove_elems = [
            ".credits",
            ".owner-info",
            ".image-zoom",
            ".continue",
            ".sequence-number",
            ".js-embed-output",
            "#mycountrytalks-embed",
            # Remove self-promotion for (other) ressorts.
            '.js-embed-output-feeds a[href^="/r"]',
            '.js-embed-output-feeds a[href^="https://derstandard.at/"]',
            (
                ".js-embed-output-feeds "
                + 'img[src="https://images.derstandard.at/2018/10/18/'
                + 'Immobiliensuche202x122.png"]'
            ),
        ]
        change_tags = {
            "#media-list li .description": "figcaption",
            "#media-list li": "figure",
            "#media-list": "div",
            ".photo": "figure",
            ".caption": "figcaption",
        }
        replace_elems = {
            ".embedded-posting": "<p><em>Hinweis: Das eingebettete Posting ist nur "
            + "im Artikel verfügbar.</em></p>",
            # Replace every special script container with its unescaped content.
            "script.js-embed-template": lambda elem: (
                '<div class="js-embed-output-feeds">'
                + html.unescape(elem.text or "")
                + "</div>"
            ),
            "img": _fix_img_src,
        }
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
            replace_elems=replace_elems,
        )
        il.add_value("link", response.url)
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        for author in response.css("span.author::text").extract():
            # Sometimes the author name is messed up and written in upper case.
            # This happens usually for articles written by Günter Traxler.
            if author.upper() == author:
                author = author.title()
            il.add_value("author_name", author)
        il.add_value("path", response.meta["ressort"])
        il.add_value("updated", response.meta["updated"])
        il.add_css("category", "#breadcrumb .item a::text")
        blog_id = response.css("#userblogentry::attr(data-objectid)").extract_first()
        if blog_id:
            # User blog entries are loaded via a separate delivery endpoint.
            url = (
                "https://{}/userprofil/bloggingdelivery/blogeintrag?godotid={}"
            ).format(self.name, blog_id)
            return scrapy.Request(url, self._parse_blog_article, meta={"il": il})
        elif response.css("#feature-content"):
            # Feature pages: inline the cover photo and feature copy text.
            cover_photo = response.css("#feature-cover-photo::attr(style)").re_first(
                r"\((.*)\)"
            )
            il.add_value("content_html", '<img src="{}">'.format(cover_photo))
            il.add_css("content_html", "#feature-cover-title h2")
            il.add_css("content_html", "#feature-content > .copytext")
            return il.load_item()
        else:
            # Regular articles (including slideshows).
            il.add_css("content_html", "#content-aside")
            il.add_css("content_html", "#objectContent > .copytext")
            il.add_css("content_html", "#content-main > .copytext")
            il.add_css("content_html", ".slide")
            return il.load_item()