Exemple #1
0
    def arrange_feed_top_level_element(self, dom: etree._Element,
                                       rss_url_prefix: str,
                                       parameters: Dict[str, str],
                                       favicon_url: str):
        """Set the channel image url and rewrite the channel links.

        The first ``./channel/image/url`` node receives ``favicon_url``; when
        the channel has no such node, an ``<image><url/></image>`` pair is
        inserted as first child of the first ``./channel`` element. Every
        channel ``<link>`` (and every channel child whose local name is
        ``link`` and which has an ``href``) is then given the value returned
        by ``self._generated_complete_url(rss_url_prefix, parameters)``.

        Args:
            dom: feed element, modified in place.
            rss_url_prefix: forwarded to ``_generated_complete_url``.
            parameters: forwarded to ``_generated_complete_url``.
            favicon_url: text written in the image url node.
        """
        existing_urls = xpath(dom, "./channel/image/url")
        target: etree._Element
        if len(existing_urls) > 0:
            target = existing_urls[0]
        else:
            # no image url yet: build <image><url/></image> and put it first
            target = etree.Element("url")
            wrapper: etree._Element = etree.Element("image")
            wrapper.append(target)
            xpath(dom, "./channel")[0].insert(0, wrapper)
        target.text = favicon_url

        # links are carried either by an href attribute or by the node text
        channel_links = xpath(
            dom, "./channel/*[local-name()='link' and @href] | ./channel/link")
        for node in channel_links:
            complete_url = self._generated_complete_url(
                rss_url_prefix, parameters)
            if "href" not in node.attrib:
                node.text = complete_url
                continue
            node.attrib["href"] = complete_url
Exemple #2
0
    def get_title(self, item: etree._Element) -> Optional[etree._Element]:
        """Return the first ``atom:title`` element found below ``item``.

        Args:
            item: element queried with ``.//atom:title`` using ``NAMESPACES``.

        Returns:
            The first element returned by the query, or None when the query
            returns nothing.
        """
        titles = xpath(item, ".//atom:title", namespaces=NAMESPACES)
        title: Optional[etree._Element] = next(iter(titles), None)
        return title
Exemple #3
0
    def _manage_reddit_preview_images(self, content) -> str:
        """Use directly the image instead of the preview

        Two passes are made over the content:

        1. for every ``IMG_PREVIEW_REDDIT`` match, the text of its first
           group is replaced by ``https://i.redd.it/<second group>``;
        2. every ``<a>`` whose ``href`` contains ``://preview.redd.it/`` is
           removed and an ``<img>`` pointing at the same address on
           ``i.redd.it`` is appended to its parent.

        Args:
            content (str): html content

        Returns:
            str: the content where preview images have been replaced by target
        """
        content_without_preview: str = content
        for preview in re.findall(IMG_PREVIEW_REDDIT, content):
            # replace in the accumulated result: starting again from the
            # original ``content`` would keep only the last substitution
            content_without_preview = content_without_preview.replace(
                preview[0], "https://i.redd.it/%s" % preview[1])

        dom = etree.HTML(content_without_preview)
        for a in xpath(dom, "//a"):
            if "href" in a.attrib and "://preview.redd.it/" in a.attrib["href"]:
                img = etree.Element("img")
                img.set(
                    "src", a.attrib["href"].replace("preview.redd.it",
                                                    "i.redd.it"))
                parent = a.getparent()
                parent.append(img)
                parent.remove(a)

        return to_string(dom)
Exemple #4
0
 def process_pictures(self, dom):
     """Give a ``src`` to the images that declare a ``data-srcset``.

     For every ``<img>`` with a ``data-srcset`` attribute, the attribute is
     split on spaces and the first piece accepted by ``is_url_valid`` is
     written in ``src``. An image without any accepted piece is left as is.
     """
     for picture in xpath(dom, '//img[@data-srcset]'):
         candidates = picture.attrib["data-srcset"].split(" ")
         # lazily stops at the first valid piece
         first_valid = next(
             (piece for piece in candidates if is_url_valid(piece)), None)
         if first_valid is not None:
             picture.attrib["src"] = first_valid
Exemple #5
0
    def _process_dugout(self, session: requests.Session, dom: etree._Element):
        """Replace dugout iframes by a ``<video>`` element.

        For each ``<iframe>`` having a ``src``, the first ``DUGOUT_VIDEO``
        match found in that ``src`` is base64-decoded and parsed as json; its
        ``key`` is used to request ``https://cdn.jwplayer.com/v2/media/<key>``
        with ``session``. Three paragraphs (video, title, description) are
        appended to the parent of the iframe and the iframe is removed.
        Any exception raised while doing so is logged with ``self.log_info``
        and ignored.

        Args:
            session: session used to request the media metadata.
            dom: tree searched for iframes, modified in place.
        """
        for frame in xpath(dom, "//iframe"):
            if "src" not in frame.attrib:
                continue

            matches: List[str] = re.findall(DUGOUT_VIDEO,
                                            get_attr_value(frame, "src"))
            # only the first match of an iframe is ever handled
            for encoded in matches[:1]:
                try:
                    key = json.loads(base64.b64decode(encoded))["key"]
                    response = session.get(
                        "https://cdn.jwplayer.com/v2/media/%s" % key)
                    metadata = json.loads(response.text)

                    video = etree.Element("video")
                    video.set("controls", "")
                    video.set("preload", "auto")
                    # best quality, last index
                    video.set(
                        "poster",
                        metadata["playlist"][0]["images"][-1]["src"])
                    video.set("width", "100%")

                    source = etree.Element("source")
                    source.set(
                        "src",
                        metadata["playlist"][0]["sources"][-1]["file"])
                    video.append(source)

                    video_paragraph = etree.Element("p")
                    video_paragraph.append(video)

                    title_paragraph = etree.Element("p")
                    title_paragraph.text = metadata["title"]

                    description_paragraph = etree.Element("p")
                    description_paragraph.text = metadata["description"]

                    # nothing is attached to the tree before this point
                    parent = frame.getparent()
                    for paragraph in (video_paragraph, title_paragraph,
                                      description_paragraph):
                        parent.append(paragraph)
                    parent.remove(frame)
                except Exception as err:
                    self.log_info(
                        "Unable to find dugout video, we ignore this exception and go on (%s)"
                        % repr(err))
Exemple #6
0
    def _get_thumbnail_url_from_description(
            self, description: etree._Element) -> str:
        """Extract a thumbnail url from a description element.

        When ``description`` has an ``<img>`` descendant, the first one is
        used: its ``url`` attribute if set, otherwise its ``src`` attribute.
        Without any ``<img>``, the serialized description is matched against
        ``IMG_URL_REGEX`` and the first group of the match is returned.

        Args:
            description: element searched for an image.

        Returns:
            The url found, or an empty string when nothing was found.
        """
        imgs = xpath(description, ".//img")
        if len(imgs) > 0:
            img = imgs[0]
            # "url" stays first for backward compatibility; a regular html
            # image only carries "src", which used to raise a KeyError here
            return img.get("url") or img.get("src") or ""

        m = re.match(IMG_URL_REGEX, to_string(description))
        if m is not None:
            return m.group(1)

        return ""
Exemple #7
0
    def get_feed(self, parameters: dict, session: Session) -> str:
        """Download the reddit feed and rewrite its entries.

        The feed url is ``self.get_rss_url()`` unless ``parameters`` has a
        ``sub`` entry, in which case the feed of that subreddit is used.
        For each ``atom:entry``:

        - when the content holds both a ``thumbs.redditmedia`` jpg url and
          another jpg url, the thumbnail url is replaced by the other one and
          a table row break is added before ``<td> &#32;``;
        - every ``atom:link`` href is replaced by the value returned by
          ``self.get_handler_url_with_parameters``.

        Args:
            parameters: request parameters, ``sub`` is the only key read here.
            session: session used to download the feed.

        Returns:
            The feed serialized with ``to_string``.
        """
        rss_url: str = self.get_rss_url()
        if "sub" in parameters:
            rss_url = "https://www.reddit.com/r/%s/.rss" % parameters["sub"]

        raw_feed = session.get(url=rss_url, headers={}).text
        # the xml declaration is stripped before the text is given to etree
        dom = etree.fromstring(
            re.sub(r'<\?xml [^>]*?>', '', raw_feed).strip())

        for entry in xpath(dom, "//atom:entry", namespaces=NAMESPACES):
            content_node = xpath(entry, "./atom:content",
                                 namespaces=NAMESPACES)[0]
            content = cast(str, content_node.text)

            # try to replace thumbnail with real picture: the last url of
            # each kind wins
            thumb: str = ""
            other: str = ""
            for quoted_url in re.findall(r'"http[^"]*jpg"', content):
                if "thumbs.redditmedia" in quoted_url:
                    thumb = quoted_url
                else:
                    other = quoted_url
            if thumb and other:
                with_picture = content.replace(thumb, other)
                content_node.text = with_picture.replace(
                    "<td> &#32;", "</tr><tr><td> &#32;")

            for link in xpath(entry, "./atom:link", namespaces=NAMESPACES):
                target = cast(str, link.attrib["href"].strip())
                link.attrib["href"] = self.get_handler_url_with_parameters(
                    {"url": target})

        return to_string(dom)
Exemple #8
0
    def _get_rss_link_description(self, link_url: str) -> str:
        """Find in the rss file the item having the link_url and return its description.

        The feed at ``self.get_rss_url()`` is downloaded and the first
        ``<description>`` of an ``<item>`` whose ``<link>`` text contains
        ``link_url`` is returned, html-unescaped.

        Args:
            link_url: text looked for in the item links.

        Returns:
            The unescaped description, or an empty string when no item
            matches or when the matching description has no text.
        """
        feed = requests.get(url=self.get_rss_url(), headers={}).text
        # the xml declaration is dropped before parsing (presumably because
        # lxml refuses str input carrying an encoding declaration -- confirm)
        feed = feed.replace('<?xml version="1.0" encoding="utf-8"?>', '')
        dom = etree.fromstring(feed)
        # link_url is passed as an XPath variable instead of being formatted
        # into the expression, so a quote inside the url cannot break it
        descriptions = dom.xpath(
            "//item/link/text()[contains(., $link_url)]/../../description",
            link_url=link_url)
        if len(descriptions) > 0 and descriptions[0].text is not None:
            return html.unescape(descriptions[0].text)

        return ""
Exemple #9
0
    def arrange_feed_top_level_element(self, dom: etree._Element,
                                       rss_url_prefix: str,
                                       parameters: Dict[str, str],
                                       favicon_url: str):
        """Set the feed icons and rewrite the atom links of the feed.

        The first un-prefixed ``./icon`` child receives ``favicon_url`` (it is
        created as first child when missing). An icon element in the atom
        namespace holding the same url is then inserted as first child on
        every call. Finally each direct child whose local name is ``link``
        and whose type is ``application/atom+xml`` gets the value returned by
        ``self._generated_complete_url(rss_url_prefix, parameters)``.

        Args:
            dom: feed element, modified in place.
            rss_url_prefix: forwarded to ``_generated_complete_url``.
            parameters: forwarded to ``_generated_complete_url``.
            favicon_url: text written in both icon elements.
        """
        found_icons = xpath(dom, "./icon")
        plain_icon: etree._Element
        if len(found_icons) > 0:
            plain_icon = found_icons[0]
        else:
            plain_icon = etree.Element("icon")
            dom.insert(0, plain_icon)
        plain_icon.text = favicon_url

        atom_icon = etree.Element("{%s}icon" % NAMESPACES["atom"],
                                  nsmap=NAMESPACES)
        atom_icon.text = favicon_url
        dom.insert(0, atom_icon)

        # links are carried either by an href attribute or by the node text
        atom_links = xpath(
            dom, "./*[local-name()='link' and @type='application/atom+xml']")
        for node in atom_links:
            complete_url = self._generated_complete_url(
                rss_url_prefix, parameters)
            if "href" not in node.attrib:
                node.text = complete_url
                continue
            node.attrib["href"] = complete_url
Exemple #10
0
 def _manage_blur_image_link(self, item: etree._Element,
                             description: etree._Element):
     """Route the images of an item through the blurring thumbnails url.

     When ``description`` has ``<img>`` descendants, the ``src`` of each one
     is replaced by ``<serving_url_prefix>/thumbnails?url=<encoded src>&blur=true``.
     Otherwise the same substitution is applied to every ``src="..."`` value
     found in the text of ``description``. ``self.replace_img_links`` is then
     called on ``item`` with the same url pattern.

     Args:
         item: element handed to ``replace_img_links``.
         description: element whose images (or text) are rewritten in place.
     """
     images: list = xpath(description, ".//img")
     if not images:
         # no img element: the markup sits in the text of the description
         for source in re.findall('src="([^"]*)"',
                                  cast(str, description.text)):
             blurred = "%s/thumbnails?url=%s&blur=true" % (
                 self.serving_url_prefix, quote_plus(source))
             description.text = description.text.replace(source, blurred)

     for image in images:
         encoded_src = quote_plus(cast(str, image.attrib["src"]))
         image.attrib["src"] = "%s/thumbnails?url=%s&blur=true" % (
             self.serving_url_prefix, encoded_src)

     self.replace_img_links(
         item, self.serving_url_prefix + "/thumbnails?url=%s&blur=true")
Exemple #11
0
    def _post_process_tweets(self, dom: etree._Element):
        """Replace links to tweets by embedded tweets.

        Each ``<a>`` whose href contains ``//twitter.com/`` and matches
        ``TWEETS_REGEX`` is removed from its parent. The parent receives the
        id ``parent-<tweet id>`` and is followed by a ``<div id="tweet_<tweet id>">``
        and a ``<script>`` asking the twitter widgets to render the tweet in
        that div (dark theme when ``self.parameters["dark"]`` is "true",
        light theme otherwise). When at least one tweet was found, the
        twitter widgets script is appended to ``dom``.

        Args:
            dom: tree searched for links, modified in place.
        """
        tweet_found: bool = False
        twitter_links = xpath(
            dom, "//a[contains(@href,'https://twitter.com/')]|//a[contains(@href,'//twitter.com/')]")
        for anchor in twitter_links:
            matched = re.match(TWEETS_REGEX, anchor.attrib["href"])
            if matched is not None:
                tweet_id: str = matched.group(1)
                tweet_found = True

                if "dark" in self.parameters and self.parameters[
                        "dark"] == "true":
                    theme = "dark"
                else:
                    theme = "light"

                loader = etree.Element("script")
                loader.text = """
                    window.addEventListener("DOMContentLoaded", function() {
                        var tweet_%s = document.getElementById("tweet_%s");
                        twttr.widgets.createTweet(
                        '%s', tweet_%s,
                        {
                            conversation : 'none',    // or all
                            cards        : 'visible',
                            theme        : '%s'
                        });
                    });
                    document.getElementById("parent-%s").style.display = "none";
                """ % (tweet_id, tweet_id, tweet_id, tweet_id, theme, tweet_id)

                holder = etree.Element("div")
                holder.set("id", "tweet_%s" % tweet_id)

                # addnext puts each node right after the parent, so the div
                # ends up before the script
                parent = anchor.getparent()
                parent.addnext(loader)
                parent.addnext(holder)
                parent.set("id", "parent-%s" % tweet_id)
                parent.remove(anchor)

        if tweet_found:
            widgets = etree.Element("script")
            widgets.set("src", "https://platform.twitter.com/widgets.js")
            widgets.set("sync", "")
            dom.append(widgets)
Exemple #12
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the Sport24 feed and route its item links through the handler.

        When ``parameters["filter"]`` is one of tennis, football, rugby,
        cyclisme or golf, the feed of that category is requested; with any
        other value, or without filter, the "accueil" feed is requested.

        The nodes selected on their ``<enclosure>`` child are then handed to
        ``utils.dom_utils.delete_nodes``: items having an enclosure when the
        filter is "flash", items without enclosure otherwise. With the
        "flash" filter an image is also inserted after every
        ``<description>`` opening tag.

        Args:
            parameters: request parameters, ``filter`` is the only key read.
            session: session used to download the feed.

        Returns:
            The feed serialized with ``to_string``, its title suffixed with
            the filter when there is one.
        """
        # a membership test is required here: the former
        # ``== ("tennis" or "football" or ...)`` only compared with "tennis"
        sport_filters = ("tennis", "football", "rugby", "cyclisme", "golf")
        category = "accueil"
        if parameters.get("filter") in sport_filters:
            # filter only on passed category, eg /sport24/rss/tennis
            category = parameters["filter"]
        feed = session.get(url=self.get_rss_url() % category,
                           headers={}).text

        # I probably do not use etree as I should
        feed = feed.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
        # escape the ampersands that are not already part of an entity
        regex = re.compile(r"&(?!amp;|lt;|gt;)")
        myxml = regex.sub("&amp;", feed)
        dom = etree.fromstring(myxml)
        description_img: str = ""

        xpath_expression = "//item[not(enclosure)]"
        if "filter" in parameters and parameters["filter"] == "flash":
            xpath_expression = "//item[enclosure]"
            description_img = "<img src=\"https://pbs.twimg.com/profile_images/932616523285516294/sqt32oQY.jpg\"/>"

        utils.dom_utils.delete_nodes(dom.xpath(xpath_expression))

        for link in xpath(dom, "//item/link"):
            if link is not None and text(link) is not None:
                link.text = self.get_handler_url_with_parameters(
                    {"url": text(link).strip()})

        feed = to_string(dom)

        title = ""
        if "filter" in parameters:
            title = " - " + parameters["filter"]

        feed = feed.replace(
            "<title>Sport24 - Toute l'actualite</title>",
            "<title>Sport24%s</title>" % string.capwords(title))

        if description_img != "":
            feed = feed.replace("<description>",
                                "<description>" + description_img)

        return feed
Exemple #13
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the feed, prefix its links and clean the encoded content.

        The response is read as utf-8. Every ``<link>`` opening tag is
        followed by ``<url_prefix>?url=`` and the non-permalink
        ``https://lesjoiesducode.fr/?p=...`` guids are removed. In each
        ``<item>``, the text of the children whose tag ends with ``encoded``
        is wrapped in a ``blog-post`` div and passed to
        ``self._clean_content``.

        Args:
            parameters: request parameters, not read by this method.
            session: session used to download the feed.

        Returns:
            The feed serialized with ``to_string``.
        """
        r = session.get(url=self.get_rss_url(), headers={})

        # force encoding
        r.encoding = "utf-8"
        feed = r.text.replace("<link>", "<link>%s?url=" % self.url_prefix)
        feed = re.sub(
            r'<guid isPermaLink="false">https://lesjoiesducode.fr/\?p=[^<]*</guid>',
            r"", feed)

        # I probably do not use etree as I should
        feed = re.sub(r'<\?xml [^>]*?>', '', feed).strip()

        dom = etree.fromstring(feed)
        for item in xpath(dom, "//item"):
            # did not find how to xpath content:encoded tag, so the children
            # are iterated directly (getchildren() is deprecated in lxml)
            for child in item:
                # comments and processing instructions have a non-str tag
                if not isinstance(child.tag, str):
                    continue
                if not child.tag.endswith("encoded"):
                    continue
                if child.text is None:
                    # nothing to clean, and concatenating None would raise
                    continue
                child.text = self._clean_content('<div class="blog-post">' +
                                                 child.text + '</div>')

        return to_string(dom)
Exemple #14
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the feed, apply the category filter and rewrite the links.

        When ``parameters`` has a ``filter`` entry, the nodes selected by the
        expression returned by
        ``utils.dom_utils.get_xpath_expression_for_filters`` are handed to
        ``utils.dom_utils.delete_nodes``. The text of every ``<link>`` and
        ``<guid>`` is then replaced by the value returned by
        ``self.get_handler_url_with_parameters``.

        Args:
            parameters: request parameters, forwarded to the filter helper.
            session: session used to download the feed.

        Returns:
            The feed serialized with ``to_string``.
        """
        raw_feed = session.get(url=self.get_rss_url()).text

        # the xml declaration is stripped before the text is given to etree
        dom = etree.fromstring(
            re.sub(r'<\?xml [^>]*?>', '', raw_feed).strip())

        if "filter" in parameters:
            # filter only on passed category
            unwanted = utils.dom_utils.get_xpath_expression_for_filters(
                parameters, "category/text() = '%s'",
                "not(category/text() = '%s')")
            utils.dom_utils.delete_nodes(dom.xpath(unwanted))

        # replace video links, they must be processed by getContent
        for node in xpath(dom, "//link|//guid"):
            handler_url = self.get_handler_url_with_parameters(
                {"url": cast(str, node.text)})
            node.text = "%s" % handler_url

        return to_string(dom)
Exemple #15
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the feed, drop its guids, filter it and rewrite item links.

        Every ``<guid>`` element and the xml declaration are removed from the
        downloaded text before parsing. When ``parameters`` has a ``filter``
        entry, the nodes selected by the expression returned by
        ``utils.dom_utils.get_xpath_expression_for_filters`` are handed to
        ``utils.dom_utils.delete_nodes``. The stripped text of every item
        ``<link>`` is then replaced by the value returned by
        ``self.get_handler_url_with_parameters``.

        Args:
            parameters: request parameters, forwarded to the filter helper.
            session: session used to download the feed.

        Returns:
            The feed serialized with ``to_string``.
        """
        downloaded = session.get(url=self.get_rss_url(), headers={}).text
        without_guid = re.sub(r'<guid>[^<]*</guid>', '', downloaded)

        # the xml declaration is stripped before the text is given to etree
        dom = etree.fromstring(
            re.sub(r'<\?xml [^>]*?>', '', without_guid).strip())

        if "filter" in parameters:
            # filter only on passed category, matched on the link path
            unwanted = utils.dom_utils.get_xpath_expression_for_filters(
                parameters, "link[contains(text(), '/%s/')]",
                "not(link[contains(text(), '/%s/')])")
            utils.dom_utils.delete_nodes(dom.xpath(unwanted))

        for item_link in xpath(dom, "//item/link"):
            original_url = cast(str, item_link.text).strip()
            item_link.text = self.get_handler_url_with_parameters(
                {"url": original_url})

        return to_string(dom)
Exemple #16
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the feed, apply the category filter and rewrite the links.

        When ``parameters`` has a ``filter`` entry, the nodes selected by the
        expression returned by
        ``utils.dom_utils.get_xpath_expression_for_filters`` are handed to
        ``utils.dom_utils.delete_nodes``. The text of every ``<link>`` and
        ``<guid>`` is replaced by the value returned by
        ``self.get_handler_url_with_parameters``. In the serialized feed the
        literal sequences ``\\u0027`` and ``\\u0022`` are both replaced by a
        single quote.

        Args:
            parameters: request parameters, forwarded to the filter helper.
            session: session used to download the feed.

        Returns:
            The serialized feed.
        """
        downloaded = session.get(url=self.get_rss_url(), headers={}).text

        # the xml declaration is dropped before the text is given to etree
        dom = etree.fromstring(
            downloaded.replace('<?xml version="1.0" encoding="utf-8"?>', ''))

        if "filter" in parameters:
            # filter only on passed category, eg /eurosport/rss/tennis
            unwanted = utils.dom_utils.get_xpath_expression_for_filters(
                parameters, "category/text() = '%s'",
                "not(category/text() = '%s')")
            utils.dom_utils.delete_nodes(dom.xpath(unwanted))

        # replace video links, they must be processed by getContent
        for node in xpath(dom, "//link|//guid"):
            handler_url = self.get_handler_url_with_parameters(
                {"url": cast(str, node.text)})
            node.text = "%s" % handler_url

        serialized = to_string(dom)
        serialized = serialized.replace("\\u0027", "'")
        return serialized.replace("\\u0022", "'")
Exemple #17
0
 def _manage_title(self, dom: etree._Element):
     """Remove the first ``<h1>`` of ``dom`` when the title must be hidden.

     Nothing is done unless ``self.parameters["hidetitle"]`` is "true".
     """
     if "hidetitle" not in self.parameters or self.parameters["hidetitle"] != "true":
         return

     headings = xpath(dom, "//h1")
     if len(headings) > 0:
         first_heading = headings[0]
         first_heading.getparent().remove(first_heading)
Exemple #18
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Download an article page and build its content.

        The tree is modified along the way, so the order of the steps matters:

        1. the title is read from ``//h1``, then the first ``<h1>`` is removed;
        2. the ``srcset`` of the first ``<img srcset>`` is kept as main image;
        3. cross-linking, media button and publication blocks are deleted;
        4. dugout iframes are handled by ``self._process_dugout``;
        5. images having a ``data-srcset`` but no ``src`` receive the first
           space-separated piece of their ``data-srcset`` as ``src``;
        6. the content is the first ``s24-art__content s24-art__resize``
           element (with the main image inserted at the top of its
           ``s24-art-body``), or else the first matching ``<article>``.

        Args:
            url: address of the page to download.
            parameters: request parameters, not read by this method.
            session: session used to download the page.

        Returns:
            A ``PyRSSWContent`` built from the title followed by the content,
            and from the css dedicated to this handler.
        """
        page = session.get(url=url, headers={})

        dom = etree.HTML(page.text)
        # the title is captured before the first h1 leaves the tree
        title = utils.dom_utils.get_content(dom, ["//h1"])
        h1s = xpath(dom, "//h1")
        if len(h1s) > 0:
            # sometimes there are 2 h1 for the same title in the page
            h1s[0].getparent().remove(h1s[0])
        imgsrc = ""
        imgs = dom.xpath("//img[@srcset]")
        if len(imgs) > 0:
            imgsrc = imgs[0].get("srcset")

        utils.dom_utils.delete_xpaths(dom, [
            '//*[@class="s24-art-cross-linking"]',
            '//*[@class="fig-media__button"]', '//*[@class="s24-art-pub-top"]'
        ])

        self._process_dugout(session, dom)

        for img in dom.xpath("//img[@data-srcset]"):
            if "src" not in img.attrib:
                img.attrib["src"] = img.get("data-srcset").split(" ")[0]

        contents = dom.xpath('//*[@class="s24-art__content s24-art__resize"]')
        if len(contents) > 0:
            if imgsrc != "":
                # ".//" keeps the search inside the content element; a
                # leading "//" is evaluated against the whole document, even
                # when called on a sub-element
                bodies = contents[0].xpath('.//*[@class="s24-art-body"]')
                if len(bodies) > 0:
                    img = etree.Element("img")
                    img.set("src", imgsrc)
                    bodies[0].insert(0, img)
            content = to_string(contents[0])
        else:
            content = utils.dom_utils.get_content(
                dom,
                [
                    # handles golf.lefigaro structure
                    '//article[contains(@class,"fig-content")]',
                    # handles lefigaro.fr/sports
                    '//article[contains(@class,"fig-main")]'
                ])

        content = "%s%s" % (title, content)
        return PyRSSWContent(
            content, """
            #sport24_handler .object-left {
                display: block;
                text-align: center;
                width: auto;
                max-width: fit-content;
                float: left;
                margin: 5px;
            }

            #sport24_handler .object-left img {
                float:none;
                margin:0;
            }

            #sport24_handler .embed {
                clear:both;
            }
            
            #sport24_handler div.object-right {
                text-align:center;
            }
        """)
Exemple #19
0
 def replace_img_links(self, item: etree._Element, replace_with: str):
     """Rewrite the ``url`` of every ``thumbnail`` element below ``item``.

     Args:
         item: element searched for thumbnails, modified in place.
         replace_with: ``%`` format pattern applied to the current url
             once encoded with ``quote_plus``.
     """
     # local-name() selects the media:thumbnail tags without their namespace
     thumbnails = xpath(item, ".//*[local-name()='thumbnail']")
     for thumbnail in thumbnails:
         encoded_url = quote_plus(cast(str, thumbnail.attrib["url"]))
         thumbnail.attrib["url"] = replace_with % encoded_url