def __create_positional_word_matrix__(self, url_data: UrlData) \
        -> np.ndarray:
    '''
    Builds the full positional word-embedding matrix for a URL.

    Each URL section (sub-domains, main domain, TLD, path, args) is
    embedded independently and padded/truncated to its own fixed
    length, then the section matrices are stacked in a fixed order.
    Because every section starts at a fixed row offset, the result is
    "positional", e.g. [0, 0, x, y, 0, 0, d, f, 0, ...]. The final
    shape is (N, embedding_dim), where N is the sum of the per-section
    maximum lengths plus 1 for the TLD.

    Args:
        url_data (UrlData): 4-tuple of url data

    Returns:
        word_matrix (np.ndarray): Full positional word embedding
                                  matrix of shape (N, embedding_dim)
    '''
    _, domains, path, args = url_data
    sub_domains, main_domain, domain_ending = domains
    flat_args = flatten_twice(args)

    # One fixed-size embedding block per URL section, in fixed order.
    sections = [
        self.__word_embed__(sub_domains, self.sub_domain_max_len),
        self.__word_embed__(main_domain, self.main_domain_max_len),
        self.__word_embed__([domain_ending], 1),
        self.__word_embed__(path, self.path_max_len),
        self.__word_embed__(flat_args, self.arg_max_len),
    ]
    return np.concatenate(sections)
def flatten_url_data(url_data: UrlData) -> List[str]:
    '''
    Collapses the UrlData 4-tuple produced by url_tokenizer into a
    single flat list of word strings. Useful when word position
    within the URL does not matter.

    Args:
        url_data (UrlData): The UrlData 4-tuple returned by
                            url_tokenizer

    Returns:
        words (List[str]): A flat list of all the words

    Examples:
        >>> url_data = url_tokenizer('http://some.test.com/path')
        >>> flatten_url_data(url_data)
        ['http', 'some', 'test', 'com', 'path']
    '''
    protocol, domains, path, args = url_data
    sub_domain, main_domain, tld = domains

    # Concatenate every section in URL order: protocol, sub-domains,
    # main domain, TLD, path, then the doubly-flattened args.
    parts = [[protocol], sub_domain, main_domain, [tld], path,
             flatten_twice(args)]
    return [word for part in parts for word in part]
def __create_sequential_word_matrix__(self, url_data: UrlData) \
        -> np.ndarray:
    '''
    Builds the full sequential word-embedding matrix for a URL.

    All words (sub-domains, main domain, TLD, path, args) are laid
    out contiguously from the start and the whole sequence is
    padded/truncated to the fixed length N, i.e.
    [x, y, z, k, d, v, f, 0, 0, 0, ...]. The result has shape
    (N, embedding_dim).

    Args:
        url_data (UrlData): 4-tuple of url data

    Returns:
        word_matrix (np.ndarray): Full sequential word embedding
                                  matrix of shape (N, embedding_dim)
    '''
    _, domains, path, args = url_data
    subs, mains, tld = domains

    # One flat word sequence in URL order, embedded as a whole.
    words = subs + mains + [tld] + path + flatten_twice(args)
    return self.__word_embed__(words, self.N)
def __create_hand_picked_features__(self, url: str,
                                    url_data: UrlData) -> np.ndarray:
    '''
    Computes a fixed-order 1D vector of hand-picked numeric features
    describing the URL (protocol flags, word/digit counts, raw-string
    lengths, and a few suspicious-pattern indicators).

    Args:
        url (str): URL string
        url_data (UrlData): 4-tuple of URL data

    Returns:
        feat_vec (np.ndarray): 1D vector of hand-picked features
    '''
    all_words = flatten_url_data(url_data)

    # Features computed on the raw (HTML-decoded) URL string.
    decoded = url_html_decoder(url)
    _, raw_domains, raw_path, raw_args = url_raw_splitter(decoded)
    raw_domain_len = len(raw_domains)
    raw_path_len = len(raw_path)
    raw_args_len = len(raw_args)
    dots_in_path_args = raw_path.count('.') + raw_args.count('.')
    n_capitals = sum(1 for ch in decoded if ch.isupper())
    # NOTE(review): pattern is only anchored at the start, so e.g.
    # '1.2.3.4.evil.com' also matches — confirm that is intended.
    looks_like_ip = int(bool(re.match(r'(\d+\.){3}\d+', raw_domains)))
    has_suspicious_symbol = int('\\' in raw_args or ':' in raw_args)

    # Features computed on the tokenized URL data.
    protocol, domains, path, args = url_data
    sub_domains, main_domain, tld = domains
    has_at = int(bool(path) and path[-1] == '@')
    https_flag = int(protocol == 'https')
    n_main_words = len(main_domain)
    n_subs = len(sub_domains)
    www_flag = int(n_subs > 0 and sub_domains[0] == 'www')
    # 'www' followed by anything else, e.g. 'www1' or 'wwww'.
    weird_www_flag = int(n_subs > 0
                         and bool(re.match(r'www.+', sub_domains[0])))
    n_path_words = len(path) - has_at
    bad_tld = int(tld in UNTRUSTWORTHY_TLDS)

    # Digit counts per URL section.
    digits_subs = sum(1 for ch in flatten(sub_domains) if ch.isdigit())
    digits_path = sum(1 for ch in flatten(path) if ch.isdigit())
    digits_args = sum(
        1 for ch in flatten(flatten_twice(args)) if ch.isdigit())
    digits_total = digits_subs + digits_path + digits_args
    word_count = len(all_words) - has_at

    # Fixed feature order; downstream code indexes by position.
    return np.array([
        https_flag, n_main_words, n_subs, www_flag, weird_www_flag,
        n_path_words, bad_tld, digits_subs, digits_path, digits_args,
        digits_total, has_at, word_count, raw_domain_len, raw_path_len,
        raw_args_len, dots_in_path_args, n_capitals, looks_like_ip,
        has_suspicious_symbol
    ])
def test_flatten_tuple(self):
    # flatten_twice should unnest a list of lists of tuples into a
    # single flat list of the tuple elements.
    nested = [[(1, 1)], [(1, ), (1, 1)]]
    assert flatten_twice(nested) == [1, 1, 1, 1, 1]
def test_flatten_lst(self):
    # flatten_twice should unnest a doubly-nested list of lists into a
    # single flat list of the innermost elements.
    nested = [[[1], [1]], [[1], [1, 1]]]
    assert flatten_twice(nested) == [1] * 5