def featureExtractor(text):
    """Build a numeric feature vector for *text*.

    Returns a list of 10 hand-crafted features followed by the top-n
    TF-IDF values appended by ``tf_idf``.
    """
    # NOTE(review): indices 3 and 5 both call multipleVictimsAdvertised —
    # looks like a copy-paste bug; confirm which feature index 5 should hold.
    # NOTE(review): index 2 computes Shannon entropy although the original
    # comment labeled it 'kolmogorovComplexity' — verify the intended feature.
    features = [
        isThirdPerson(text),                # thirdPersonLanguage
        hasFirstPersonPlural(text),         # firstPersonPlurals
        shannonEntropy(text),               # entropy-based complexity
        multipleVictimsAdvertised(text),    # multipleVictimsAdvertised
        containsCountriesOfInterest(text),  # containsCountriesOfInterest
        multipleVictimsAdvertised(text),    # multipleVictimsAdvertised (duplicate of index 3)
        victimWeightMentioned(text),        # victimWeightMentioned
        presenceOfPhrasesAndWords(text),    # presenceOfPhrasesAndWords
        presenceOfSpa(text),                # presenceOfSpa
        textHasEmoji(text),                 # textHasEmoji
    ]
    features.extend(tf_idf(text))  # TF-IDF top n vals
    return features
def isVeryLikelyEncrypted(dataBytes):
    """Heuristically decide whether *dataBytes* is encrypted/compressed.

    Short payloads (<= 512 bytes) are scored with Shannon entropy;
    longer ones with gzip-based entropy. A score of at least 0.9 is
    treated as "very likely encrypted".
    """
    if len(dataBytes) <= 512:
        entropyValue = entropy.shannonEntropy(dataBytes)
    else:
        entropyValue = entropy.gzipEntropy(dataBytes)
    return entropyValue >= 0.9
# Example #3
# 0
# NOTE(review): this loop body is truncated at the end of this chunk —
# the comments below cover only the visible portion.
for f in fi:
    # Each input line is tab-separated:
    # url, malicious label, ccTLD string, ccTLD id.
    url, is_malicious, cctld, cctld_num = f.strip().split('\t')

    # host
    # NOTE(review): the first '/'-delimited token is taken as the host,
    # which assumes the URL carries no 'scheme://' prefix — TODO confirm
    # the input format (the commented-out code at 1.1 suggests the scheme
    # was stripped at some point).
    host = url.strip().split('/')[0]  # host = 'www.ourcrazyveterans.com'
    host = host.lower()

    # Registered-domain extraction via tldextract; the '$' sentinels mark
    # the domain boundaries — presumably so start/end n-grams are
    # distinguishable downstream; verify against the n-gram feature code.
    ext = tldextract.extract(url)
    core_domain = '$' + ext.domain.lower() + '$'  # core_domain = "$google$"

    # 1. Shannon entropy feature

    # 1.1 url
    # new_url = url.strip().split('://')[1]               # 'http://forum.woltlab.fr/commu/path1'
    # new_url = new_url[:-1]
    # shannonEntropy here returns a 3-tuple (length, char count, entropy),
    # unlike the single-value variant used elsewhere in this file.
    f_len1, count1, entropy1 = shannonEntropy(
        url)  # 'forum.woltlab.fr/commu/path1'

    # 1.2 host
    # f_len2, count2, entropy2 = shannonEntropy(host)        # forum.woltlab.fr

    # 1.3 core domainn
    # f_len3, count3, entropy3 = shannonEntropy(core_domain) # woltlab

    # Candidate lexical ratio features, currently disabled:
    # vowel_ratio = count_vowels(core_domain)/f_len
    # digit_ratio = count_digits(core_domain)/f_len
    # repeat_letter = count_repeat_letter(core_domain)/f_len
    # consec_digit = consecutive_digits(core_domain)/f_len
    # consec_consonant = consecutive_consonant(core_domain)/f_len

    # 2. N-Gram Feature