def remove_identical_nodes(elem1: html.Element, elem2: html.Element) -> bool:
    """Prune, in place, the child subtrees that two elements have in common.

    The two elements are first compared by their ``html.tostring``
    serialization. If the serializations are equal nothing is modified and
    ``True`` is returned, leaving it to the caller to drop the pair.

    Otherwise the children are compared pairwise by position, up to the
    child count of the shorter element, using this same function
    recursively. Every pair that compares identical is removed from both
    parents, and ``False`` is returned.
    """
    if html.tostring(elem1) == html.tostring(elem2):
        return True

    # Collect the matching pairs first; the child lists are only edited once
    # the pairwise walk has finished.
    matched = [
        (left, right)
        for left, right in zip(elem1, elem2)
        if remove_identical_nodes(left, right)
    ]

    # Remove from the last match back to the first.
    for left, right in reversed(matched):
        elem1.remove(left)
        elem2.remove(right)

    return False


# Example no. 2
def regularize_other(elem: html.Element):
    """Mask assorted volatile markup in ``elem`` and all of its descendants.

    The tree is edited in place and nothing is returned. Depending on the
    element's tag, selected attributes or the element text are overwritten
    with the ``"[removed]"`` placeholder (or with a fixed prefix), and for
    some elements the children are dropped as well. After handling ``elem``
    itself the function recurses into every remaining child.

    The two-letter markers in the comments are kept from the original code;
    presumably they name the US state site each rule was written for.
    """
    placeholder = "[removed]"
    tag = elem.tag

    if tag == "input":
        # AZ
        if elem.attrib.get("type") == "hidden":
            elem.attrib["value"] = placeholder

    elif tag == "div":
        attrs = elem.attrib
        css_class = attrs.get("class")
        if attrs.get("id") == "DeltaFormDigest":
            # CA
            elem.text = placeholder
            for child in list(elem):
                elem.remove(child)
        elif (safe_starts_with(css_class, "view view-tweets")  # IL
                or safe_contains(css_class, " id-")):  # OH
            attrs["class"] = placeholder

    elif tag == "script":
        attrs = elem.attrib
        text = elem.text
        src = attrs.get("src")

        # The order of these rules matters: only the first matching rule is
        # applied, and text-based and src-based rules are interleaved.
        if (safe_starts_with(text, "jQuery.extend(Drupal.setting")  # CO
                or safe_starts_with(text, "window.NREUM")
                or safe_contains(text, "var WASReqURL = ")  # OH
                or safe_contains(text, "wpModules.theme.WindowUtils")):
            elem.text = placeholder
        elif safe_contains(src, "/wps/contenthandler"):  # OH
            attrs["src"] = "/wps/contenthandler"
        elif (safe_contains(text, "var formDigestElement = ")  # KY
                or safe_contains(text, "RegisterSod(")):
            elem.text = placeholder
        elif safe_contains(src, "_Incapsula_Resource"):  # MO and NJ
            attrs["src"] = "/_Incapsula_Resource"
        elif (safe_contains(text, "var g_correlationId = '")  # NE
                or safe_contains(
                    text, "var MSOWebPartPageFormName = 'aspnetForm'")  # PA
                or safe_contains(text, 'window["blob')  # RI
                or safe_contains(text, 'window["bob')):
            elem.text = placeholder
        elif safe_starts_with(attrs.get("id"), "EktronScriptBlock"):  # TX
            attrs["id"] = "EktronScriptBlock"
            elem.text = placeholder

    elif tag == "noscript":
        # RI and WA: blank the element completely.
        elem.text = ""
        for child in list(elem):
            elem.remove(child)

    elif tag == "meta":
        # CT
        if elem.attrib.get("name") == "VIcurrentDateTime":
            elem.attrib["content"] = placeholder

    elif tag == "link":
        # OH
        attrs = elem.attrib
        if safe_starts_with(attrs.get("href"), "/wps/portal/gov"):
            attrs["id"] = placeholder
            attrs["href"] = placeholder

    elif tag == "a":
        # OH
        if elem.attrib.get("class") == "left-navigation__link":
            elem.attrib["href"] = placeholder
            elem.text = ""

    elif tag == "body":
        # KY: keep only the leading "brwsr-safari" part of the class value.
        if safe_starts_with(elem.attrib.get("class"), "brwsr-safari"):
            elem.attrib["class"] = "brwsr-safari"

    # Apply the same rules to whatever children are left.
    for child in elem:
        regularize_other(child)