def featureExtractor(text):
    """Build the feature vector for *text*.

    Returns a list: 10 hand-crafted scalar features, followed by the
    top-n TF-IDF values appended by ``tf_idf``.
    """
    features = [
        isThirdPerson(text),                # third-person language
        hasFirstPersonPlural(text),         # first-person plurals
        shannonEntropy(text),               # entropy score (original comment called it 'kolmogorovComplexity')
        multipleVictimsAdvertised(text),    # multiple victims advertised
        containsCountriesOfInterest(text),  # countries of interest
        multipleVictimsAdvertised(text),    # NOTE(review): duplicates index 3 — confirm intended feature
        victimWeightMentioned(text),        # victim weight mentioned
        presenceOfPhrasesAndWords(text),    # key phrases and words
        presenceOfSpa(text),                # presence of spa
        textHasEmoji(text),                 # emoji present
    ]
    features.extend(tf_idf(text))           # TF-IDF top n vals
    return features
def isVeryLikelyEncrypted(dataBytes):
    """Heuristically decide whether *dataBytes* looks encrypted.

    Small payloads (<= 512 bytes) are scored with Shannon entropy;
    larger ones with gzip-based entropy. Scores at or above 0.9 are
    treated as "very likely encrypted".
    """
    if len(dataBytes) <= 512:
        score = entropy.shannonEntropy(dataBytes)
    else:
        score = entropy.gzipEntropy(dataBytes)
    return score >= 0.9
for f in fi: url, is_malicious, cctld, cctld_num = f.strip().split('\t') # host host = url.strip().split('/')[0] # host = 'www.ourcrazyveterans.com' host = host.lower() ext = tldextract.extract(url) core_domain = '$' + ext.domain.lower() + '$' # core_domain = "$google$" # 1. Shannon entropy feature # 1.1 url # new_url = url.strip().split('://')[1] # 'http://forum.woltlab.fr/commu/path1' # new_url = new_url[:-1] f_len1, count1, entropy1 = shannonEntropy( url) # 'forum.woltlab.fr/commu/path1' # 1.2 host # f_len2, count2, entropy2 = shannonEntropy(host) # forum.woltlab.fr # 1.3 core domainn # f_len3, count3, entropy3 = shannonEntropy(core_domain) # woltlab # vowel_ratio = count_vowels(core_domain)/f_len # digit_ratio = count_digits(core_domain)/f_len # repeat_letter = count_repeat_letter(core_domain)/f_len # consec_digit = consecutive_digits(core_domain)/f_len # consec_consonant = consecutive_consonant(core_domain)/f_len # 2. N-Gram Feature