Example no. 1
0
    def __create_positional_word_matrix__(self, url_data: UrlData) \
            -> np.ndarray:
        '''
        Builds the full positional word-embedding matrix for a URL, of shape
        (N, embedding_dim). N equals the combined (fixed) lengths reserved
        for the sub-domains, main domain, path and args, plus 1 for the TLD.
        "Positional" means each URL section starts at a fixed offset and is
        padded/truncated to its fixed length,
        i.e. [0, 0, x, y, 0, 0, d, f, 0, ...]

        Args:
            url_data (UrlData): 4-tuple of url data

        Returns:
            word_matrix (np.ndarray): Full positional word embedding matrix of
                                      shape (N, embedding_dim)
        '''
        _, (subs, main, tld), path_words, args = url_data

        # Each URL section paired with the fixed slot length it is
        # padded/truncated to; order defines the layout of the matrix.
        sections = [
            (subs, self.sub_domain_max_len),
            (main, self.main_domain_max_len),
            ([tld], 1),
            (path_words, self.path_max_len),
            (flatten_twice(args), self.arg_max_len),
        ]
        return np.concatenate([
            self.__word_embed__(words, slot_len)
            for words, slot_len in sections
        ])
Example no. 2
0
def flatten_url_data(url_data: UrlData) -> List[str]:
    '''
    Collapses the UrlData 4-tuple produced by url_tokenizer into one flat
    list of words. Useful for simplifying the problem when the position of
    a word within the URL is irrelevant.

    Args:
        url_data (UrlData): The UrlData 4-tuple returned by url_tokenizer

    Returns:
        words (List[str]): A flat list of all the words

    Examples:
        >>> url_data = url_tokenizer('http://some.test.com/path')
        >>> flatten_url_data(url_data)
        ['http', 'some', 'test', 'com', 'path']
    '''
    protocol, (sub_domain, main_domain, tld), path, args = url_data
    # Concatenate every section in URL order: protocol, sub-domains,
    # main domain, TLD, path, then the (doubly-nested) args flattened.
    parts = [[protocol], sub_domain, main_domain, [tld], path,
             flatten_twice(args)]
    return [word for part in parts for word in part]
Example no. 3
0
    def __create_sequential_word_matrix__(self, url_data: UrlData) \
            -> np.ndarray:
        '''
        Builds the full sequential word-embedding matrix for a URL, of shape
        (N, embedding_dim), where N is the sum of the lengths of the
        sub_domains, main_domains, paths, args + 1 for the TLD. "Sequential"
        means the embedding vectors are packed from the beginning and then
        padded/truncated to the fixed length N,
        i.e. [x, y, z, k, d, v, f, 0, 0, 0, ...]

        Args:
            url_data (UrlData): 4-tuple of url data

        Returns:
            word_matrix (np.ndarray): Full sequential word embedding matrix of
                                      shape (N, embedding_dim)
        '''
        _, (subs, main, tld), path_words, args = url_data
        # One flat word list in URL order; padding/truncation to N happens
        # inside __word_embed__.
        all_words = subs + main + [tld] + path_words + flatten_twice(args)
        return self.__word_embed__(all_words, self.N)
Example no. 4
0
    def __create_hand_picked_features__(self, url: str,
                                        url_data: UrlData) -> np.ndarray:
        '''
        Builds a 1D vector of hand-picked (engineered) features from the raw
        URL string and its tokenized form.

        Args:
            url (str): URL string
            url_data (UrlData): 4-tuple of URL data

        Returns:
            feat_vec (np.ndarray): 1D vector of hand-picked features
        '''
        def count_digits(chars):
            # Number of digit characters in an iterable of characters.
            return sum(1 for ch in chars if ch.isdigit())

        words = flatten_url_data(url_data)

        # Raw-string features, computed on the HTML-decoded URL.
        decoded = url_html_decoder(url)
        _, domains_raw, path_raw, args_raw = url_raw_splitter(decoded)

        domain_len = len(domains_raw)
        path_len = len(path_raw)
        args_len = len(args_raw)

        dot_count_in_path_and_args = path_raw.count('.') + args_raw.count('.')
        capital_count = sum(1 for ch in decoded if ch.isupper())

        # A bare numeric IP in place of a domain name is a classic red flag.
        domain_is_ip_address = int(
            bool(re.match(r'(\d+\.){3}\d+', domains_raw)))
        contain_suspicious_symbol = int('\\' in args_raw or ':' in args_raw)

        # Tokenized features.
        protocol, domains, path, args = url_data
        sub_domains, main_domain, domain_ending = domains

        contains_at_symbol = int(bool(path) and path[-1] == '@')
        is_https = int(protocol == 'https')
        num_main_domain_words = len(main_domain)
        num_sub_domains = len(sub_domains)
        is_www = int(num_sub_domains > 0 and sub_domains[0] == 'www')
        # NOTE(review): r'www.+' matches 'www' followed by ANY character
        # (e.g. 'www2' but also 'wwwx') — presumably intentional "www-like".
        is_www_weird = int(num_sub_domains > 0
                           and bool(re.match(r'www.+', sub_domains[0])))
        num_path_words = len(path) - contains_at_symbol
        domain_end_verdict = int(domain_ending in UNTRUSTWORTHY_TLDS)

        # Digit counts per URL section, plus their total.
        sub_domains_num_digits = count_digits(flatten(sub_domains))
        path_num_digits = count_digits(flatten(path))
        args_num_digits = count_digits(flatten(flatten_twice(args)))
        total_num_digits = (sub_domains_num_digits + path_num_digits +
                            args_num_digits)

        word_count_in_url = len(words) - contains_at_symbol

        # Feature order is fixed; downstream consumers index by position.
        return np.array([
            is_https, num_main_domain_words, num_sub_domains, is_www,
            is_www_weird, num_path_words, domain_end_verdict,
            sub_domains_num_digits, path_num_digits, args_num_digits,
            total_num_digits, contains_at_symbol, word_count_in_url,
            domain_len, path_len, args_len, dot_count_in_path_and_args,
            capital_count, domain_is_ip_address, contain_suspicious_symbol
        ])
Example no. 5
0
 def test_flatten_tuple(self):
     # Doubly-nested tuples should flatten into a single list of elements.
     expected = [1] * 5
     assert flatten_twice([[(1, 1)], [(1, ), (1, 1)]]) == expected
Example no. 6
0
 def test_flatten_lst(self):
     # Doubly-nested lists should flatten into a single list of elements.
     expected = [1] * 5
     assert flatten_twice([[[1], [1]], [[1], [1, 1]]]) == expected