def arrange_feed_top_level_element(self, dom: etree._Element,
                                   rss_url_prefix: str,
                                   parameters: Dict[str, str],
                                   favicon_url: str):
    """Point the channel icon at *favicon_url* and rewrite channel links.

    Creates the <image><url/></image> structure when the feed lacks one,
    then rewrites every channel-level link (attribute or text form) to
    the rewritten feed URL.
    """
    # update or create icon tag element in channel
    existing_urls = xpath(dom, "./channel/image/url")
    if existing_urls:
        url_node: etree._Element = existing_urls[0]
    else:
        # no <image> block yet: build one and make it the first child
        image_node: etree._Element = etree.Element("image")
        url_node = etree.Element("url")
        image_node.append(url_node)
        xpath(dom, "./channel")[0].insert(0, image_node)
    url_node.text = favicon_url

    # arrange links in channel: atom-style links carry the URL in @href,
    # plain RSS links carry it as text content
    rewritten_url = self._generated_complete_url(rss_url_prefix, parameters)
    link_nodes = xpath(
        dom, "./channel/*[local-name()='link' and @href] | ./channel/link")
    for link_node in link_nodes:
        if "href" in link_node.attrib:
            link_node.attrib["href"] = rewritten_url
        else:
            link_node.text = rewritten_url
def get_title(self, item: etree._Element) -> Optional[etree._Element]:
    """Return the first atom:title element found under *item*, or None.

    Fix: the local annotation used ``etree._Elements``, a type that does
    not exist in lxml (typo for ``etree._Element``).
    """
    titles = xpath(item, ".//atom:title", namespaces=NAMESPACES)
    return titles[0] if titles else None
def _manage_reddit_preview_images(self, content) -> str:
    """Use directly the image instead of the preview

    Args:
        content ([type]): html content

    Returns:
        str: the content where preview images have been replaced by target

    Fix: the replacements must accumulate on ``content_without_preview``;
    replacing on the original ``content`` each iteration kept only the
    LAST preview's substitution when several previews were present.
    """
    content_without_preview: str = content
    for preview in re.findall(IMG_PREVIEW_REDDIT, content):
        content_without_preview = content_without_preview.replace(
            preview[0], "https://i.redd.it/%s" % preview[1])

    dom = etree.HTML(content_without_preview)
    for a in xpath(dom, "//a"):
        if "href" in a.attrib and a.attrib["href"].find(
                "://preview.redd.it/") > -1:
            # replace the <a> link by a real <img> pointing at i.redd.it
            img = etree.Element("img")
            img.set(
                "src",
                a.attrib["href"].replace("preview.redd.it", "i.redd.it"))
            a.getparent().append(img)
            a.getparent().remove(a)
    return to_string(dom)
def process_pictures(self, dom):
    """Promote the first valid URL found in data-srcset to src for
    lazy-loaded images."""
    for img in xpath(dom, '//img[@data-srcset]'):
        candidates = img.attrib["data-srcset"].split(" ")
        chosen = next((c for c in candidates if is_url_valid(c)), None)
        if chosen is not None:
            img.attrib["src"] = chosen
def _process_dugout(self, session: requests.Session, dom: etree._Element):
    # Replace dugout video iframes by plain <video> elements.
    # For every iframe whose src embeds a base64-encoded dugout payload,
    # fetch the jwplayer metadata and append a <video> plus title and
    # description paragraphs next to the iframe, then drop the iframe.
    # Failures are logged and ignored so the rest of the page renders.
    for iframe in xpath(dom, "//iframe"):
        if "src" in iframe.attrib:
            dugout_ps: List[str] = re.findall(
                DUGOUT_VIDEO, get_attr_value(iframe, "src"))
            for dugout_p in dugout_ps:
                try:
                    # the iframe src carries a base64-encoded JSON payload
                    # whose "key" identifies the media on jwplayer's CDN
                    key = json.loads(base64.b64decode(dugout_p))["key"]
                    dugout_metadata = json.loads(
                        session.get(
                            "https://cdn.jwplayer.com/v2/media/%s"
                            % key).text)
                    p1 = etree.Element("p")
                    video = etree.Element("video")
                    video.set("controls", "")
                    video.set("preload", "auto")
                    # best quality, last index
                    video.set(
                        "poster",
                        dugout_metadata["playlist"][0]["images"][-1]["src"])
                    video.set("width", "100%")
                    source = etree.Element("source")
                    source.set(
                        "src",
                        dugout_metadata["playlist"][0]["sources"][-1]["file"])
                    video.append(source)
                    p1.append(video)
                    p2 = etree.Element("p")
                    p2.text = dugout_metadata["title"]
                    p3 = etree.Element("p")
                    p3.text = dugout_metadata["description"]
                    # dead alternative kept from a previous revision:
                    """
                    parents: List[etree._Element] = xpath(
                        dom, '//*[@class="s24-art__content s24-art__resize"]')
                    if len(parents) > 0:
                        parents[0].append(p1)
                        parents[0].append(p2)
                        parents[0].append(p3)
                    """
                    iframe.getparent().append(p1)
                    iframe.getparent().append(p2)
                    iframe.getparent().append(p3)
                    iframe.getparent().remove(iframe)
                except Exception as err:
                    # best effort: a broken dugout video must not break
                    # the whole page
                    self.log_info(
                        "Unable to find dugout video, we ignore this exception and go on (%s)"
                        % repr(err))
                    # NOTE(review): break placement reconstructed from a
                    # whitespace-collapsed source — confirm it belongs in
                    # the except branch (stop after first failure) rather
                    # than at loop level (process only first payload)
                    break
def _get_thumbnail_url_from_description(
        self, description: etree._Element) -> str:
    """Extract a thumbnail URL from an item description.

    Prefers the first <img> element's "url" attribute; falls back to
    matching IMG_URL_REGEX against the serialized description.
    Returns "" when nothing is found.

    Fix: ``imgs[0].attrib["url"]`` raised KeyError whenever the first
    <img> had no "url" attribute; use .get() and fall through to the
    regex instead.
    NOTE(review): standard HTML <img> carries its URL in "src", not
    "url" — confirm which attribute the processed feeds actually emit.
    """
    thumbnail_url: str = ""
    imgs = xpath(description, ".//img")
    if imgs:
        thumbnail_url = imgs[0].attrib.get("url", "")
    if not thumbnail_url:
        m = re.match(IMG_URL_REGEX, to_string(description))
        if m is not None:
            thumbnail_url = m.group(1)
    return thumbnail_url
def get_feed(self, parameters: dict, session: Session) -> str:
    """Fetch the reddit RSS feed (optionally for one sub), swap
    thumbnails for the real pictures and reroute entry links through
    the handler."""
    rss_url: str = self.get_rss_url()
    if "sub" in parameters:
        rss_url = "https://www.reddit.com/r/%s/.rss" % parameters["sub"]
    feed = session.get(url=rss_url, headers={}).text
    feed = re.sub(r'<\?xml [^>]*?>', '', feed).strip()
    # I probably do not use etree as I should
    dom = etree.fromstring(feed)
    for entry in xpath(dom, "//atom:entry", namespaces=NAMESPACES):
        content_node = xpath(
            entry, "./atom:content", namespaces=NAMESPACES)[0]
        content = cast(str, content_node.text)

        # try to replace thumbnail with real picture
        thumb: str = ""
        other: str = ""
        for img in re.findall(r'"http[^"]*jpg"', content):
            if "thumbs.redditmedia" in img:
                thumb = img
            else:
                other = img
        if thumb and other:
            content_node.text = content.replace(thumb, other).replace(
                "<td>  ", "</tr><tr><td>  ")

        for link in xpath(entry, "./atom:link", namespaces=NAMESPACES):
            link.attrib["href"] = self.get_handler_url_with_parameters(
                {"url": cast(str, link.attrib["href"].strip())})
    return to_string(dom)
def _get_rss_link_description(self, link_url: str) -> str:
    """find in rss file the item having the link_url and returns the description"""
    feed = requests.get(url=self.get_rss_url(), headers={}).text
    # I probably do not use etree as I should
    feed = feed.replace('<?xml version="1.0" encoding="utf-8"?>', '')
    dom = etree.fromstring(feed)
    # locate the <description> sibling of the matching <link>
    nodes = xpath(
        dom,
        "//item/link/text()[contains(., '%s')]/../../description" % link_url)
    if not nodes:
        return ""
    return html.unescape(cast(str, nodes[0].text))
def arrange_feed_top_level_element(self, dom: etree._Element,
                                   rss_url_prefix: str,
                                   parameters: Dict[str, str],
                                   favicon_url: str):
    # Ensure the atom feed carries the favicon as its icon, then rewrite
    # the self links to the proxied feed URL.
    icons = xpath(dom, "./icon")
    icon_node: etree._Element
    if len(icons) == 0:
        icon_node = etree.Element("icon")
        dom.insert(0, icon_node)
    else:
        icon_node = icons[0]
    icon_node.text = favicon_url
    # NOTE(review): a second, namespaced atom icon is ALWAYS inserted
    # below, even when a non-namespaced <icon> was just created/updated
    # above — this can leave two icon elements in the output. The plain
    # "./icon" xpath cannot match a namespaced atom:icon, so the two
    # branches never see each other's element; confirm both inserts are
    # intended.
    icon = etree.Element("{%s}icon" % NAMESPACES["atom"], nsmap=NAMESPACES)
    icon.text = favicon_url
    dom.insert(0, icon)
    # arrange links in channel: atom links carry the URL in @href,
    # otherwise it is the element's text
    other_link_nodes = xpath(
        dom, "./*[local-name()='link' and @type='application/atom+xml']")
    for link_node in other_link_nodes:
        if "href" in link_node.attrib:
            link_node.attrib["href"] = self._generated_complete_url(
                rss_url_prefix, parameters)
        else:
            link_node.text = self._generated_complete_url(
                rss_url_prefix, parameters)
def _manage_blur_image_link(self, item: etree._Element,
                            description: etree._Element):
    """Route every image of the description (and of the item's media
    thumbnails) through the blurring thumbnail endpoint.

    Fix: when the description has no child <img>, its raw text was
    scanned with ``re.findall`` — which raises TypeError when
    ``description.text`` is None; guard with an empty-string fallback.
    """
    imgs: list = xpath(description, ".//img")
    if len(imgs) > 0:
        for img in imgs:
            img.attrib["src"] = "%s/thumbnails?url=%s&blur=true" % (
                self.serving_url_prefix,
                quote_plus(cast(str, img.attrib["src"])))
    else:
        # description may hold raw html in its text instead of children
        text_content = cast(str, description.text) or ""
        for src in re.findall('src="([^"]*)"', text_content):
            description.text = description.text.replace(
                src, "%s/thumbnails?url=%s&blur=true" %
                (self.serving_url_prefix, quote_plus(src)))
    self.replace_img_links(
        item, self.serving_url_prefix + "/thumbnails?url=%s&blur=true")
def _post_process_tweets(self, dom: etree._Element):
    """
    Process tweets, to replace twitter url by tweets' content

    Fix: the widgets.js loader <script> was given a "sync" attribute,
    which does not exist in HTML — the documented twitter snippet loads
    the script with "async".
    """
    has_tweets: bool = False
    for a in xpath(
            dom,
            "//a[contains(@href,'https://twitter.com/')]|//a[contains(@href,'//twitter.com/')]"):
        m = re.match(TWEETS_REGEX, a.attrib["href"])
        if m is None:
            continue
        tweet_id: str = m.group(1)
        has_tweets = True
        # inline JS: render the tweet in a placeholder div and hide the
        # original link's parent once done
        script = etree.Element("script")
        script.text = """
            window.addEventListener("DOMContentLoaded", function() {
                var tweet_%s = document.getElementById("tweet_%s");
                twttr.widgets.createTweet(
                    '%s', tweet_%s,
                    {
                        conversation : 'none',    // or all
                        cards        : 'visible',
                        theme        : '%s'
                    });
            });
            document.getElementById("parent-%s").style.display = "none";
        """ % (
            tweet_id, tweet_id, tweet_id, tweet_id,
            "dark" if "dark" in self.parameters
            and self.parameters["dark"] == "true" else "light",
            tweet_id
        )
        tweet_div = etree.Element("div")
        tweet_div.set("id", "tweet_%s" % tweet_id)
        a.getparent().addnext(script)
        a.getparent().addnext(tweet_div)
        a.getparent().set("id", "parent-%s" % tweet_id)
        a.getparent().remove(a)
    if has_tweets:
        # load the twitter widgets library once, asynchronously
        script = etree.Element("script")
        script.set("src", "https://platform.twitter.com/widgets.js")
        script.set("async", "")
        dom.append(script)
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Fetch the sport24 feed for the requested category and rewrite it.

    Fix: the original condition compared ``parameters["filter"]`` with
    ``("tennis" or "football" or ...)``, which evaluates to "tennis"
    only — every other listed sport silently fell back to the "accueil"
    feed. Use a proper membership test.
    """
    if "filter" in parameters and parameters["filter"] in (
            "tennis", "football", "rugby", "cyclisme", "golf"):
        # filter only on passed category, eg /sport24/rss/tennis
        feed = session.get(url=self.get_rss_url() % parameters["filter"],
                           headers={}).text
    else:
        feed = session.get(url=self.get_rss_url() % "accueil",
                           headers={}).text

    # I probably do not use etree as I should
    feed = feed.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
    # escape bare ampersands that would break XML parsing
    regex = re.compile(r"&(?!amp;|lt;|gt;)")
    myxml = regex.sub("&amp;", feed)
    dom = etree.fromstring(myxml)
    description_img: str = ""

    xpath_expression = "//item[not(enclosure)]"
    if "filter" in parameters and parameters["filter"] == "flash":
        xpath_expression = "//item[enclosure]"
        description_img = "<img src=\"https://pbs.twimg.com/profile_images/932616523285516294/sqt32oQY.jpg\"/>"
    utils.dom_utils.delete_nodes(dom.xpath(xpath_expression))

    for link in xpath(dom, "//item/link"):
        if link is not None and text(link) is not None:
            link.text = self.get_handler_url_with_parameters(
                {"url": text(link).strip()})

    feed = to_string(dom)
    title = ""
    if "filter" in parameters:
        title = " - " + parameters["filter"]
    feed = feed.replace(
        "<title>Sport24 - Toute l'actualite</title>",
        "<title>Sport24%s</title>" % string.capwords(title))
    if description_img != "":
        feed = feed.replace("<description>",
                            "<description>" + description_img)
    return feed
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Return the upstream feed with proxied links, stripped guids and
    cleaned entry content."""
    r = session.get(url=self.get_rss_url(), headers={})
    # force encoding
    r.encoding = "utf-8"
    feed = r.text.replace("<link>", "<link>%s?url=" % self.url_prefix)
    feed = re.sub(
        r'<guid isPermaLink="false">https://lesjoiesducode.fr/\?p=[^<]*</guid>',
        r"", feed)
    # I probably do not use etree as I should
    feed = re.sub(r'<\?xml [^>]*?>', '', feed).strip()
    dom = etree.fromstring(feed)
    # did not find how to xpath content:encoded tag, so walk children
    for item in xpath(dom, "//item"):
        encoded_children = [
            child for child in item.getchildren()
            if child.tag.endswith("encoded")
        ]
        for child in encoded_children:
            child.text = self._clean_content(
                '<div class="blog-post">' + child.text + '</div>')
    return to_string(dom)
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Return the feed, optionally filtered by category, with links
    routed through the content handler."""
    feed = session.get(url=self.get_rss_url()).text
    # I probably do not use etree as I should
    feed = re.sub(r'<\?xml [^>]*?>', '', feed).strip()
    dom = etree.fromstring(feed)

    if "filter" in parameters:
        # filter only on passed category
        expression = utils.dom_utils.get_xpath_expression_for_filters(
            parameters, "category/text() = '%s'",
            "not(category/text() = '%s')")
        utils.dom_utils.delete_nodes(dom.xpath(expression))

    # replace video links, they must be processed by getContent
    for node in xpath(dom, "//link|//guid"):
        node.text = "%s" % self.get_handler_url_with_parameters(
            {"url": cast(str, node.text)})

    return to_string(dom)
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Return the feed without guids, optionally filtered on the URL
    path, with item links routed through the handler."""
    feed = session.get(url=self.get_rss_url(), headers={}).text
    feed = re.sub(r'<guid>[^<]*</guid>', '', feed)
    # I probably do not use etree as I should
    feed = re.sub(r'<\?xml [^>]*?>', '', feed).strip()
    dom = etree.fromstring(feed)

    if "filter" in parameters:
        # filter only on passed category
        expression = utils.dom_utils.get_xpath_expression_for_filters(
            parameters, "link[contains(text(), '/%s/')]",
            "not(link[contains(text(), '/%s/')])")
        utils.dom_utils.delete_nodes(dom.xpath(expression))

    for link in xpath(dom, "//item/link"):
        link.text = self.get_handler_url_with_parameters(
            {"url": cast(str, link.text).strip()})

    return to_string(dom)
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Return the feed, optionally filtered by category, with every
    link and guid routed through the content handler."""
    feed = session.get(url=self.get_rss_url(), headers={}).text
    # I probably do not use etree as I should
    feed = feed.replace('<?xml version="1.0" encoding="utf-8"?>', '')
    dom = etree.fromstring(feed)

    if "filter" in parameters:
        # filter only on passed category, eg /eurosport/rss/tennis
        expression = utils.dom_utils.get_xpath_expression_for_filters(
            parameters, "category/text() = '%s'",
            "not(category/text() = '%s')")
        utils.dom_utils.delete_nodes(dom.xpath(expression))

    # replace video links, they must be processed by getContent
    for node in xpath(dom, "//link|//guid"):
        node.text = "%s" % self.get_handler_url_with_parameters(
            {"url": cast(str, node.text)})

    # unescape unicode escape sequences left in the serialized feed
    return to_string(dom).replace("\\u0027", "'").replace("\\u0022", "'")
def _manage_title(self, dom: etree._Element):
    """Remove the first <h1> of the page when the "hidetitle"
    parameter is set to "true"."""
    if self.parameters.get("hidetitle") != "true":
        return
    h1s = xpath(dom, "//h1")
    if h1s:
        h1s[0].getparent().remove(h1s[0])
def get_content(self, url: str, parameters: dict,
                session: requests.Session) -> PyRSSWContent:
    """Fetch the article page, strip ads/cross-links, inline the main
    picture and dugout videos, and return the readable content.

    Fix: ``contents[0].xpath('//*[...]')`` is document-absolute in lxml
    even when called on an element — it searched the whole page instead
    of the article subtree; use the relative form './/*[...]'. Also use
    the project ``xpath`` helper consistently.
    """
    page = session.get(url=url, headers={})
    dom = etree.HTML(page.text)

    title = utils.dom_utils.get_content(dom, ["//h1"])
    h1s = xpath(dom, "//h1")
    if len(h1s) > 0:  # sometimes there is 2 h1 for the same title in the page
        h1s[0].getparent().remove(h1s[0])

    imgsrc = ""
    imgs = xpath(dom, "//img[@srcset]")
    if len(imgs) > 0:
        imgsrc = imgs[0].get("srcset")

    utils.dom_utils.delete_xpaths(dom, [
        '//*[@class="s24-art-cross-linking"]',
        '//*[@class="fig-media__button"]',
        '//*[@class="s24-art-pub-top"]'
    ])

    self._process_dugout(session, dom)

    # promote the first data-srcset candidate for lazy-loaded images
    for img in xpath(dom, "//img[@data-srcset]"):
        if "src" not in img.attrib:
            img.attrib["src"] = img.get("data-srcset").split(" ")[0]

    contents = xpath(dom, '//*[@class="s24-art__content s24-art__resize"]')
    if len(contents) > 0:
        if imgsrc != "":
            # './/*' keeps the search inside the article subtree
            bodies = contents[0].xpath('.//*[@class="s24-art-body"]')
            if len(bodies) > 0:
                img = etree.Element("img")
                img.set("src", imgsrc)
                bodies[0].insert(0, img)
        content = to_string(contents[0])
    else:
        content = utils.dom_utils.get_content(
            dom,
            [
                # handles golf.lefigaro structure
                '//article[contains(@class,"fig-content")]',
                # handles lefigaro.fr/sports
                '//article[contains(@class,"fig-main")]'
            ])

    content = "%s%s" % (title, content)

    return PyRSSWContent(
        content, """
#sport24_handler .object-left {
    display: block;
    text-align: center;
    width: auto;
    max-width: fit-content;
    float: left;
    margin: 5px;
}
#sport24_handler .object-left img {
    float:none;
    margin:0;
}

#sport24_handler .embed {
    clear:both;
}
#sport24_handler div.object-right {
    text-align:center;
}
""")
def replace_img_links(self, item: etree._Element, replace_with: str):
    """Rewrite every media:thumbnail url through the *replace_with*
    template (which must contain a single %s placeholder)."""
    thumbnails = xpath(item, ".//*[local-name()='thumbnail']")
    for media in thumbnails:  # media:thumbnail tag
        original_url = cast(str, media.attrib["url"])
        media.attrib["url"] = replace_with % quote_plus(original_url)