Example #1
    def _get_xmlns_values(self) -> Set[str]:
        # Check the root tag itself, then every descendant tag, for an xmlns attribute.
        values = {
            helpers.fix_possible_url(tag.attrib["xmlns"])
            for tag in self._tree.iterfind(".[@xmlns]")
        }
        values |= {
            helpers.fix_possible_url(tag.attrib["xmlns"])
            for tag in self._tree.iterfind(".//*[@xmlns]")
        }
        return values
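
A minimal standalone sketch of the same lookup, assuming the tree was parsed with lxml.html (whose parser keeps xmlns as an ordinary attribute rather than a namespace declaration); the document is made up and the helpers.fix_possible_url normalization is omitted:

import lxml.html

tree = lxml.html.fromstring(
    '<html xmlns="http://www.w3.org/1999/xhtml">'
    '<body><svg xmlns="http://www.w3.org/2000/svg"></svg></body></html>')

# ".[@xmlns]" matches the root tag itself; ".//*[@xmlns]" matches any descendant tag.
values = {tag.attrib["xmlns"] for tag in tree.iterfind(".[@xmlns]")}
values |= {tag.attrib["xmlns"] for tag in tree.iterfind(".//*[@xmlns]")}
print(values)  # {'http://www.w3.org/1999/xhtml', 'http://www.w3.org/2000/svg'}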
Example #2
    def _get_base_url_from_html(self) -> str:
        tag = self._tree.find(".//base[@href]")
        if tag is not None:
            base_url = helpers.fix_possible_url(tag.attrib["href"])
            return base_url if is_url(base_url) else ""

        return ""
Example #3
    def decode_proofpoint_v2(self) -> str:
        try:
            query_url = self.query_dict["u"][0]

            # When Proofpoint encodes a URL multiple times, "-2D" must be replaced first, since it
            # represents the "-" character that every other encoded sequence relies on. After that,
            # the order of the remaining replacements does not matter, with one exception: "_" must
            # be decoded to "/" before "-5F" is decoded to "_" (hex 0x5F), which the insertion
            # order of the dict below guarantees.
            possible_url = query_url.replace("-2D", "-")

            replacements = {
                "_": "/",
                "-26": "&",
                "-3A": ":",
                "-3D": "=",
                "-3F": "?",
                "-5F": "_"
            }

            for replace_encoded, replace_decoded in replacements.items():
                possible_url = possible_url.replace(replace_encoded,
                                                    replace_decoded)

            possible_url = helpers.fix_possible_url(possible_url)

            return possible_url if URL(possible_url).is_url else ""
        except KeyError:
            return ""
Example #4
    def _get_src_values(self) -> Set[str]:
        values = set()
        for tag in self._tree.iterfind(".//*[@src]"):
            if self.base_url:
                values.add(helpers.fix_possible_value(tag.attrib["src"]))
            else:
                values.add(helpers.fix_possible_url(tag.attrib["src"]))
            # Blank the attribute so later passes over the tree do not collect it again.
            tag.attrib["src"] = ""
        return values
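
A minimal standalone version of the harvesting loop, with the helpers normalization and the base_url branch collapsed away; the sample document is made up:

import lxml.html

tree = lxml.html.fromstring(
    '<html><body><img src="https://example.com/a.png">'
    '<script src="/js/app.js"></script></body></html>')

values = set()
for tag in tree.iterfind(".//*[@src]"):
    values.add(tag.attrib["src"])
    tag.attrib["src"] = ""  # blanked, as in the method above

print(values)  # {'https://example.com/a.png', '/js/app.js'}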
Example #5
    def _get_srcset_values(self) -> Set[str]:
        values = set()
        for tag in self._tree.iterfind(".//*[@srcset]"):
            value = helpers.fix_possible_url(tag.attrib["srcset"])
            splits = value.split(",")
            values |= {s.strip().split(" ")[0] for s in splits}

            tag.attrib["srcset"] = ""

        return values
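
A srcset attribute holds comma-separated candidates of the form "URL descriptor", so splitting on commas and keeping the first space-separated field of each piece yields just the URLs:

srcset = "images/a-small.png 480w, images/a-large.png 1024w"
urls = {s.strip().split(" ")[0] for s in srcset.split(",")}
print(urls)  # {'images/a-small.png', 'images/a-large.png'}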
Example #6
    def find_urls(self,
                  strict: bool = True,
                  domain_as_url: bool = False) -> Set[str]:
        tok = tokenizer.UTF8Tokenizer(self.blob)

        token_iter = chain(
            tok.get_line_tokens(),
            tok.get_tokens_between_angle_brackets(strict=strict),
            tok.get_tokens_between_backticks(),
            tok.get_tokens_between_brackets(strict=strict),
            tok.get_tokens_between_curly_brackets(strict=strict),
            tok.get_tokens_between_double_quotes(),
            tok.get_tokens_between_parentheses(strict=strict),
            tok.get_tokens_between_single_quotes(),
            tok.get_sentences(),
        )

        split_token_iter = tok.get_split_tokens_after_replace(
            ["<", ">", "`", "[", "]", "{", "}", '"', "'", "(", ")"])

        if domain_as_url:
            tokens = set()
            for token in chain(token_iter, split_token_iter):
                if ("." in token and "/" in token) or validators.domain(token):
                    tokens.add(token)
        else:
            tokens = {
                t
                for t in chain(token_iter, split_token_iter)
                if "." in t and "/" in t
            }

        valid_urls = URLList()
        for token in tokens:
            # Plaintext email bodies commonly encode URLs as:
            #     http://displayed.com<http://actual.com>
            # where the text before the angle brackets is displayed and the text inside them is
            # the real destination. Skip the combined string here rather than treating it as one
            # URL; the tokenizer passes above already yield the two parts as separate tokens.
            if "<" in token and token.endswith(">"):
                continue

            valid_urls.append(
                helpers.fix_possible_url(token, domain_as_url=domain_as_url))

        return set(valid_urls)
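
A minimal illustration of the skip condition at the end of the method; in the real flow the tokenizer passes above also produce the two URLs of the combined form as separate tokens:

tokens = {
    "http://domain.com<http://actualdomain.com>",  # combined display/target form: skipped
    "http://domain.com",
    "http://actualdomain.com",
}

kept = {t for t in tokens if not ("<" in t and t.endswith(">"))}
print(sorted(kept))  # ['http://actualdomain.com', 'http://domain.com']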
Example #7
    def decode_mandrillapp(self) -> str:
        base64_string = self.query_dict["p"][0].replace("_", "/")
        decoded = base64.b64decode(f"{base64_string}===")

        try:
            outer_json = json.loads(decoded)
            inner_json = json.loads(outer_json["p"])
            possible_url = helpers.fix_possible_url(inner_json["url"])
            return possible_url if URL(possible_url).is_url else ""
        except (json.JSONDecodeError, UnicodeDecodeError):
            return ""
Example #8
    def decode_proofpoint_v3(self) -> str:
        try:
            match = re.search(r"v3/__(.+?)__;(.*?)!", self.value,
                              re.IGNORECASE)
            embedded_url = match.group(1)
            base64_characters = match.group(2)

            decoded_characters = base64.urlsafe_b64decode(
                f"{base64_characters}===").decode("utf-8")
            # Each "*" in the embedded URL is a placeholder; fill them in order with the
            # base64-decoded characters.
            for character in decoded_characters:
                embedded_url = embedded_url.replace("*", character, 1)

            embedded_url = helpers.fix_possible_url(embedded_url)

            return embedded_url if URL(embedded_url).is_url else ""
        except AttributeError:
            return ""
Example #9
    def find_urls(self) -> Set[str]:
        valid_urls = URLList()

        for document_write_url in self._find_document_write_urls():
            valid_urls.append(document_write_url)

        for window_location_url in self._get_window_location_href():
            valid_urls.append(helpers.fix_possible_url(window_location_url))

        for visible_url in self._find_visible_urls():
            valid_urls.append(visible_url)

        for meta_refresh_value in self._get_meta_refresh_values():
            valid_urls.append(meta_refresh_value)

        possible_urls = set()
        possible_urls |= {
            urljoin(self.base_url, u)
            for u in self._get_base_url_eligible_values()
        }

        srcset_values = self._get_srcset_values()
        possible_urls = {
            u
            for u in possible_urls
            if not any(srcset_value in u for srcset_value in srcset_values)
        }
        possible_urls |= {urljoin(self._base_url, u) for u in srcset_values}

        possible_urls |= self._get_tag_attribute_values()

        for possible_url in possible_urls:
            valid_urls.append(helpers.fix_possible_url(possible_url))

        tok = tokenizer.UTF8Tokenizer(self.tree_string)

        # TODO: itertools.product(*zip(string.lower(), string.upper()))
        token_iter = chain.from_iterable(
            tok.get_tokens_between_open_and_close_sequence(f"{quote}{scheme}",
                                                           quote,
                                                           strict=True)
            for quote in ('"', "'")
            for scheme in ("http", "ftp", "HTTP", "FTP"))

        for token in token_iter:
            valid_urls.append(token)

        return set(valid_urls)
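
The tokenizer's open/close-sequence scan is not shown in these examples, so the stand-in below is an assumption about its behavior: find an opening sequence such as '"http', then yield everything up to the next closing quote:

def tokens_between(blob: str, open_seq: str, close_seq: str):
    start = blob.find(open_seq)
    while start != -1:
        end = blob.find(close_seq, start + len(open_seq))
        if end == -1:
            return
        yield blob[start + 1:end]  # drop the opening quote, keep the scheme
        start = blob.find(open_seq, end + 1)

html = '<a href="https://example.com/a">x</a> <img src="http://example.com/b.png">'
print(list(tokens_between(html, '"http', '"')))
# ['https://example.com/a', 'http://example.com/b.png']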
Example #10
def test_fix_possible_url():
    assert helpers.fix_possible_url(
        "//domain.com\\index\u0000.html") == "https://domain.com/index.html"