def clean_element(self, elem: html.Element):
    """Clean ``elem`` and, recursively, its descendants in place.

    Steps, in order:

    * Noise elements (script/style/meta/... tags, comments and processing
      instructions) are removed from their parent and not processed further.
    * ``<form>`` elements lose their ``action`` and ``onsubmit`` attributes.
    * An ``<a>`` whose ``href`` points at twitter.com / t.co hands its parent
      to ``remove_twitter_cluster`` for inspection.
    * ``<svg>`` elements have all children deleted without visiting them;
      every other element has its children cleaned recursively.
    * ``<div>``/``<span>`` elements that ``is_empty`` reports as empty are
      removed; ``<div>``/``<span>``/``<a>`` elements for which
      ``mark_special_case`` returns true are left as they are (their
      attributes are not cleaned).
    * Links get surrounding whitespace trimmed.
    * Finally ``clean_attributes`` is applied to the element.

    An element without a parent (the tree root) is never removed; the
    removal request is silently skipped instead of raising.

    Args:
        elem: the element to clean.
    """
    noise_tags = (
        "script", "noscript", "style", "meta", "input",
        "iframe", "select", "link", "font",
    )
    twitter_prefixes = (
        "https://twitter.com",
        "http://twitter.com",
        "https://t.co",
    )

    def detach() -> None:
        # Look the parent up at call time: a root element has none, and
        # calling ``None.remove`` would abort the whole cleaning pass.
        parent = elem.getparent()
        if parent is not None:
            parent.remove(elem)

    tag = elem.tag

    if (
        tag in noise_tags
        or tag == etree.Comment
        or tag == etree.ProcessingInstruction
    ):
        detach()
        return

    if tag == "form":
        elem.attrib.pop("action", None)
        elem.attrib.pop("onsubmit", None)

    if tag == "a":
        href = elem.attrib.get("href")
        if href is not None and href.startswith(twitter_prefixes):
            parent = elem.getparent()
            if parent is not None:
                self.remove_twitter_cluster(parent)

    if tag == "svg":
        # Drop the drawing content entirely; only the <svg> shell is kept.
        while len(elem):
            del elem[0]
    else:
        # Iterate over a snapshot: the recursive call may detach the child
        # it is given, which must not disturb the traversal.
        for child in list(elem):
            self.clean_element(child)

    if tag in ("div", "span"):
        if self.is_empty(elem):
            detach()
            return
        if self.mark_special_case(elem):
            return
    elif tag == "a":
        if self.mark_special_case(elem):
            return
        # strip spaces from simple links
        # NOTE(review): this step is scoped to <a> elements, following the
        # wording of the comment above -- confirm against file history.
        if len(elem) > 0:
            elem[-1].tail = None
        elif elem.text is not None:
            elem.text = elem.text.strip()

    self.clean_attributes(elem)
def regularize_attrib(self, elem: html.Element, n: str):
    """Normalise the volatile part of attribute ``n`` on ``elem`` in place.

    * ``id``: a value that ``is_guid`` recognises is replaced by ``"[guid]"``.
    * ``href``:
        - a ``#...`` fragment that ``is_guid`` recognises becomes ``"#"``;
        - a ``https://www.google.com/url?q=`` redirect is cut off at its
          ``&ust=`` parameter, when present;
        - a ``https://twitter.com`` link whose text starts with ``@`` causes
          the whole element to be removed from its parent.
    * ``src``: any ``https://www.youtube.com/`` URL is reduced to
      ``"https://www.youtube.com"``.

    Other attributes, and attributes that are not set on the element, are
    left untouched.

    Args:
        elem: element owning the attribute.
        n: attribute name.
    """
    # ``.get`` rather than indexing: a missing attribute is a no-op instead
    # of a KeyError, which is what the None guard below is there for.
    value = elem.attrib.get(n)
    if value is None:
        return

    if n == "id":
        if self.is_guid(value):
            elem.attrib[n] = "[guid]"
    elif n == "href":
        if value.startswith("#") and self.is_guid(value):
            elem.attrib[n] = "#"
        elif value.startswith("https://www.google.com/url?q="):
            if self.trace:
                logger.info(f"google >>{value}<<")
            cut = value.find("&ust=")
            if cut > 0:
                if self.trace:
                    logger.info("removed ust")
                elem.attrib[n] = value[:cut]
        elif (
            value.startswith("https://twitter.com")
            and elem.text is not None
            and elem.text.startswith("@")
        ):
            # A parentless element cannot be detached; skip it.
            parent = elem.getparent()
            if parent is not None:
                parent.remove(elem)
    elif n == "src":
        if value.startswith("https://www.youtube.com/"):
            elem.attrib[n] = "https://www.youtube.com"
def remove_twitter_cluster(self, elem: html.Element):
    """Queue an embedded twitter block for removal if ``elem`` looks like one.

    ``elem`` is accepted as part of a twitter cluster only when all of the
    following hold:

    * it is a ``<span>`` whose children are all ``<a>`` elements with an
      ``href`` on twitter.com or t.co;
    * its parent and its grandparent each have exactly one or two children;
    * the serialised last child of the grandparent contains ``"> ago"``.

    When the checks pass, the grandparent's text is cleared and each of its
    children is appended to ``self.to_remove``; nothing is detached here.
    When any check fails the method returns without changing anything. With
    ``self.trace`` set, each decision is logged.

    Args:
        elem: candidate element, normally the parent of a twitter link. A
            value of ``None`` is treated like any other non-span.
    """
    twitter_prefixes = (
        "http://twitter.com",
        "https://twitter.com",
        "https://t.co",
    )

    def log(message: str) -> None:
        if self.trace:
            logger.info(message)

    log(" twitter: check")

    # check it
    if elem is None or elem.tag != "span":
        log(" twitter: not span")
        return

    for child in elem:
        if child.tag != "a":
            log(" twitter: child not a")
            return
        # ``.get`` so that an anchor without href is rejected, not a KeyError.
        href = child.attrib.get("href")
        if href is None:
            log(" twitter: child no href")
            return
        if not href.startswith(twitter_prefixes):
            log(" twitter: child bad link")
            return

    # Walk up two levels; each ancestor must hold exactly 1 or 2 children.
    # A missing ancestor is reported as length 0 and rejected.
    container = elem
    for label in ("parent", "parent.parent"):
        container = container.getparent()
        count = 0 if container is None else len(container)
        if count not in (1, 2):
            log(f" twitter: {label} length ({count})")
            return

    # The length check above guarantees at least one child here.
    last = container[-1]
    text = html.tostring(last)
    log(f" twitter next >>{text}<<")
    if b"> ago" not in text:
        log(" twitter: missing ago")
        return

    # mark it for removal
    log(" twitter: remove")
    container.text = ""
    self.to_remove.extend(container)