def remove_identical_nodes(elem1: html.Element, elem2: html.Element) -> bool:
    """Prune matching subtrees from two element trees, in place.

    Both elements are serialized with ``html.tostring``.  When the two
    serializations are equal the pair is reported as identical and neither
    element is modified.  Otherwise the children are compared pairwise by
    position (only as far as the shorter child list reaches), recursively,
    and every child pair that compares identical is removed from its
    respective parent.

    Returns:
        True if *elem1* and *elem2* serialize identically, False otherwise
        (in which case their identical child pairs have been removed).
    """
    if html.tostring(elem1) == html.tostring(elem2):
        return True

    # Collect first, remove afterwards: the child lists must not change
    # while they are being walked in lockstep.
    identical_pairs = []
    for left, right in zip(elem1, elem2):
        if remove_identical_nodes(left, right):
            identical_pairs.append((left, right))

    for left, right in identical_pairs:
        elem1.remove(left)
        elem2.remove(right)

    return False
def _drop_children(elem):
    """Detach every child element of *elem*."""
    for child in list(elem):
        elem.remove(child)


def _regularize_input(elem):
    """Blank the value of hidden ``<input>`` fields."""
    # AZ
    if elem.attrib.get("type") == "hidden":
        elem.attrib["value"] = "[removed]"


def _regularize_div(elem):
    """Apply the ``<div>`` rules; the first matching rule wins."""
    attrs = elem.attrib

    # CA
    if attrs.get("id") == "DeltaFormDigest":
        elem.text = "[removed]"
        _drop_children(elem)
        return

    css_class = attrs.get("class")
    # IL, then OH
    if safe_starts_with(css_class, "view view-tweets") or safe_contains(
        css_class, " id-"
    ):
        attrs["class"] = "[removed]"


def _regularize_script(elem):
    """Apply the ``<script>`` rules; the first matching rule wins.

    Rules on the script text and rules on its ``src``/``id`` attributes are
    interleaved, so the order of the checks below is significant.
    """
    attrs = elem.attrib
    body = elem.text

    # CO (first two checks), then OH
    if (
        safe_starts_with(body, "jQuery.extend(Drupal.setting")
        or safe_starts_with(body, "window.NREUM")
        or safe_contains(body, "var WASReqURL = ")
        or safe_contains(body, "wpModules.theme.WindowUtils")
    ):
        elem.text = "[removed]"
        return

    # OH
    if safe_contains(attrs.get("src"), "/wps/contenthandler"):
        attrs["src"] = "/wps/contenthandler"
        return

    # KY
    if safe_contains(body, "var formDigestElement = ") or safe_contains(
        body, "RegisterSod("
    ):
        elem.text = "[removed]"
        return

    # MO and NJ
    if safe_contains(attrs.get("src"), "_Incapsula_Resource"):
        attrs["src"] = "/_Incapsula_Resource"
        return

    # NE, PA, then RI (last two checks)
    if (
        safe_contains(body, "var g_correlationId = '")
        or safe_contains(body, "var MSOWebPartPageFormName = 'aspnetForm'")
        or safe_contains(body, 'window["blob')
        or safe_contains(body, 'window["bob')
    ):
        elem.text = "[removed]"
        return

    # TX
    if safe_starts_with(attrs.get("id"), "EktronScriptBlock"):
        attrs["id"] = "EktronScriptBlock"
        elem.text = "[removed]"


def _regularize_noscript(elem):
    """Empty ``<noscript>`` elements: clear the text and drop all children."""
    # RI and WA
    elem.text = ""
    _drop_children(elem)


def _regularize_meta(elem):
    """Mask the content of the ``VIcurrentDateTime`` ``<meta>`` tag."""
    # CT
    if elem.attrib.get("name") == "VIcurrentDateTime":
        elem.attrib["content"] = "[removed]"


def _regularize_link(elem):
    """Mask ``id`` and ``href`` of ``<link>`` tags pointing at /wps/portal/gov."""
    # OH
    attrs = elem.attrib
    if safe_starts_with(attrs.get("href"), "/wps/portal/gov"):
        attrs["id"] = "[removed]"
        attrs["href"] = "[removed]"


def _regularize_anchor(elem):
    """Mask the ``href`` and clear the text of left-navigation links."""
    # OH
    if elem.attrib.get("class") == "left-navigation__link":
        elem.attrib["href"] = "[removed]"
        elem.text = ""


def _regularize_body(elem):
    """Reduce a ``brwsr-safari...`` body class to just ``brwsr-safari``."""
    # KY
    if safe_starts_with(elem.attrib.get("class"), "brwsr-safari"):
        elem.attrib["class"] = "brwsr-safari"


# Tag name -> handler implementing the rules for that tag.
_TAG_REGULARIZERS = {
    "input": _regularize_input,
    "div": _regularize_div,
    "script": _regularize_script,
    "noscript": _regularize_noscript,
    "meta": _regularize_meta,
    "link": _regularize_link,
    "a": _regularize_anchor,
    "body": _regularize_body,
}


def regularize_other(elem: html.Element):
    """Rewrite selected attributes/text of *elem* and its descendants in place.

    Each supported tag (``input``, ``div``, ``script``, ``noscript``,
    ``meta``, ``link``, ``a``, ``body``) has a small set of rules that
    replace a matching attribute or text with a fixed placeholder, or strip
    the element's children.  Elements with any other tag are left untouched.
    After the element itself is handled, the same processing is applied
    recursively to each of its remaining children.

    The two-letter comments on the rules (AZ, CA, ...) are kept from the
    original code; presumably they name the site each rule was written
    for -- confirm with the author.
    """
    handler = _TAG_REGULARIZERS.get(elem.tag)
    if handler is not None:
        handler(elem)

    for child in elem:
        regularize_other(child)