Example #1
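Builds a Markov chain table over detokenized token windows, skipping any window whose average transition probability falls below the gibberish threshold.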
def addToTable(text, order):
    global table, size
    tokens = word_tokenize(text)
    # Make the index table
    for i in range(len(tokens) - order):
        sub = d.detokenize(tokens[i:i + order])
        if gib_detect_train.avg_transition_prob(sub, model_mat) < threshold:
            continue
        if sub not in table:
            table[sub] = {}
            table[sub]["SIZE"] = 0
            size += 1
    # Count the following strings for each string
    for j in range(len(tokens) - order - order):
        index = d.detokenize(tokens[j:j + order])
        if gib_detect_train.avg_transition_prob(index, model_mat) < threshold:
            continue
        k = j + order
        following = d.detokenize(tokens[k:k + order])
        if gib_detect_train.avg_transition_prob(following,
                                                model_mat) < threshold:
            continue
        if following not in table[index] and len(following) > 0:
            table[index][following] = 1
            table[index]["SIZE"] += 1
        elif len(following) > 0:
            table[index][following] += 1
            table[index]["SIZE"] += 1
Example #2
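Flags a single word as random (1) or not (0); the count_gib helper that follows applies the same threshold test across a list of labels.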
    def check_word_random(self, word):

        if gib_detect_train.avg_transition_prob(
                word, self.model_mat) < self.threshold:
            return 1
        else:
            return 0
def count_gib(labels):
    #Count gibberish labels based on a trained model
    count = 0
    for label in labels:
        if gib_detect_train.avg_transition_prob(label, model_mat) <= threshold:
            count += 1
    return count
def markov_score(temp):
    global model_data
    # query = temp
    model_mat = model_data['mat']
    # threshold = model_data['thresh']
    score = gib_detect_train.avg_transition_prob(temp, model_mat)

    return score
Example #6
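Sorts the words of a candidate domain into keywords, brands, near-matches of either, DGA-like gibberish, and two length buckets.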
    def parse(self, words):

        keywords_in_words = []
        brands_in_words = []
        similar_to_brands = []
        similar_to_keywords = []
        dga_in_words = []
        len_gt_7 = []
        len_lt_7 = []
        try:
            for word in words:

                word = re.sub("\d+", "", word)

                if word in self.keywords:
                    keywords_in_words.append(word)

                elif word in self.allbrand:
                    brands_in_words.append(word)

                elif (target := self.__is_similar_to_any_element(
                        word, self.allbrand)) != 0:
                    similar_to_brands.append(target)

                elif (target := self.__is_similar_to_any_element(
                        word, self.keywords)) != 0:
                    similar_to_keywords.append(target)

                elif len(word) > 3 and not word.isnumeric():

                    if gib_detect_train.avg_transition_prob(
                            word, self.model_mat) <= self.threshold:
                        dga_in_words.append(word)
                        # TODO: entries that resemble keywords should be cleaned out
                    elif len(word) < 7:
                        len_lt_7.append(word)
                    else:
                        len_gt_7.append(word)

            result = {
                'keywords_in_words': keywords_in_words,
                'brands_in_words': brands_in_words,
                'dga_in_words': dga_in_words,
                'len_lt_7': len_lt_7,
                'len_gt_7': len_gt_7,
                'similar_to_brands': similar_to_brands,
                'similar_to_keywords': similar_to_keywords
            }
        except Exception:
            result = {}
            self.logger.debug("error while processing " + str(words))
            self.logger.error("Error : {0}".format(format_exc()))

        return result
Example #7
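Returns the raw average transition probability of a string under the globally loaded model.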
def check_gib(temp):
    global model_data
    model_mat = model_data['mat']
    score = gib_detect_train.avg_transition_prob(temp, model_mat)

    return score
Example #8
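Scores the second-level label of every domain in a CSV; the detect helper that follows returns a boolean gibberish verdict for arbitrary text.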
def test():
    csv = pd.read_csv('Data.csv')
    domain = csv['domain'].values
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    for i in range(len(domain)):
        dom = domain[i].split('.')[0]
        print(domain[i])
        print(gib_detect_train.avg_transition_prob(dom, model_mat))
def detect(text):
    model_data = pickle.load(open('helpers/GibberishDetector/gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    threshold = model_data['thresh']
    return gib_detect_train.avg_transition_prob(text, model_mat) > threshold
Example #10
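Counts how many words in a list score at or below the gibberish threshold.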
def check_word(wordList):
    '''Returns count of gibberish present in the list of words'''
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    threshold = model_data['thresh']
    count = 0
    for word in wordList:
        if gib_detect_train.avg_transition_prob(word, model_mat) <= threshold:
            count += 1
    return count
Example #11
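Adds name-based features (length, gibberish score, vowel ratio) to a DataFrame; processDecades then strips short or gibberish lines from per-decade text files.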
    def apply_name_filters(pdf):
        # name length
        pdf['name_len'] = pdf['name'].map(len)

        # name gibberish score
        name_model_data = pickle.load(open('../private/gib_model.pki', 'rb'))
        name_model_mat = name_model_data['mat']
        gib_score = lambda x: gib_detect_train.avg_transition_prob(
            x, name_model_mat)
        pdf['name_gibberish_score'] = pdf['name'].map(gib_score)

        # name vowel ratio
        pdf['name_vowel_ratio'] = pdf['name'].apply(Preprocessor.vowel_ratio)

        return pdf
def processDecades():
    prevPrevData = []
    prevData = []
    repeats = set()
    for decade in range(1900, 2020, 10):
        print("processing", str(decade))
        directory = os.fsencode("decades_modified/" + str(decade))
        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            readf = codecs.open("decades_modified/" + str(decade) + "/" +
                                filename,
                                'r',
                                encoding="cp1252",
                                errors="replace")
            filedata = readf.read()
            readf.close()
            processed = processText(filedata)
            lines = processed.split("\n")
            reprocessed = ""
            for s in lines:
                if len(s) < 4 or gib_detect_train.avg_transition_prob(
                        s, model_mat) < threshold or "......." in s:
                    pass
                # elif s in prevData or s in prevPrevData:
                #     repeats.add(s)
                else:
                    reprocessed += s + "\n"
            output = reprocessed
            # sents = nltk.sent_tokenize(reprocessed)
            # output = ""
            # for s in sents:
            #     if len(s) < 4 or gib_detect_train.avg_transition_prob(s, model_mat) < threshold:
            #         pass
            #     elif s in prevData or s in prevPrevData:
            #         repeats.add(s)
            #     else:
            #         output += s + " "
            writef = codecs.open("decades_modified/" + str(decade) + "/" +
                                 filename,
                                 'w+',
                                 encoding="utf-8",
                                 errors='ignore')
            writef.write(output)
            writef.close()
Example #13
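Flow callback that extracts a DNS, SSL, or HTTP name and prints a warning when the name fails the gibberish test.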
def random_callback_name(flow):
    """ This is the main function that do the work """

    name = None

    if (flow.dns_info):
        name = str(flow.dns_info.domain_name)
    elif (flow.ssl_info):
        name = str(flow.ssl_info.server_name)
    elif (flow.http_info):
        name = str(flow.http_info.host_name)

    if name:
        name = name[:-4]
        if name.startswith("www."):
            name = name[4:]

    if name:
        value = gib_detect_train.avg_transition_prob(name,
                                                     model_mat) > threshold
        if not value:
            print("WARNING:%s:%s Unknown malware detected" %
                  (flow.l7_protocol_name, name))
Example #14
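Fragment of a feature-extraction loop combining linguistic ratios, an HMM log probability, and the gibberish verdict into one tab-separated feature row.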
    vowel_ratio = count_vowels(main_domain) / f_len
    digit_ratio = count_digits(main_domain) / f_len
    repeat_letter = count_repeat_letter(main_domain) / f_len
    consec_digit = consecutive_digits(main_domain) / f_len
    consec_consonant = consecutive_consonant(main_domain) / f_len

    # probability of staying in the markov transition matrix (trained on Alexa domains)
    hmm_prob_ = hmm_prob(hmm_main_domain)
    # probability is too low to be non-DGA
    if hmm_prob_ < math.e**hmm_prob_threshold:
        hmm_log_prob = -999.
    else:
        hmm_log_prob = math.log(hmm_prob_)

    # advanced linguistic feature: pronounceable domain
    gib_value = int(
        gib_detect_train.avg_transition_prob(main_domain.strip('$'), model_mat)
        > threshold)
    try:
        fw.write(
            '%s\t%s\t%.3f\t%.1f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n'
            % (domain, tld, entropy, f_len, entropy / f_len, vowel_ratio,
               digit_ratio, repeat_letter, consec_digit, consec_consonant,
               gib_value, hmm_log_prob, ave(unigram_rank), ave(bigram_rank),
               ave(trigram_rank), std(unigram_rank), std(bigram_rank),
               std(trigram_rank), has_private_tld))
    except UnicodeEncodeError:
        continue

fw.close()
fi.close()
Example #15
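Script that enriches a CSV of display names with length, gibberish score, vowel ratio, and day-of-week features.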
import pandas as pd
import numpy as np
import pickle
import gib_detect_train

df = pd.read_csv("orig-data.csv")
model_data = pickle.load(open('../private/gib_model.pki', 'rb'))
model_mat = model_data['mat']
gib_score = lambda x : gib_detect_train.avg_transition_prob(x, model_mat)

df['name_len'] = df['display_name'].map(len)

df['name_gibberish_score'] = df['display_name'].map(gib_score)

def vowel_ratio(x):
    count = 0
    for c in x.lower():
        if c in ['a', 'e', 'i', 'o', 'u']:
            count=count+1
    return float(count)/float(len(x))

df['name_vowel_ratio'] = df['display_name'].apply(vowel_ratio)

import datetime

def get_day_of_week(x):
    date_str = x[:10]
    datetime_obj = datetime.datetime.strptime(date_str, '%Y-%m-%d')
    return datetime_obj.weekday()

Example #16
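Scores the second-level label of a domain against the trained transition matrix.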
def HMM(domain):
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    main_domain = domain.split('.')[0]
    hmm = gib_detect_train.avg_transition_prob(main_domain, model_mat)
    return hmm
Example #17
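Returns both the raw score and a boolean gibberish verdict for a string.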
    def score(self, input_str):
        score = gib_detect_train.avg_transition_prob(input_str, self.model_mat, self.threshold)
        isgib = score > self.threshold
        return score, isgib
Example #18
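Fragment computing n-gram rank features and a gibberish flag for a host, followed by a small command-line gibberish checker.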
    # 2. N-Gram Feature

    unigram_rank = np.array(
        [gram_rank_dict[i] if i in gram_rank_dict else 0 for i in host[1:-1]])
    bigram_rank = np.array([
        gram_rank_dict[''.join(i)] if ''.join(i) in gram_rank_dict else 0
        for i in bigrams(host)
    ])  # extract the bigram
    trigram_rank = np.array([
        gram_rank_dict[''.join(i)] if ''.join(i) in gram_rank_dict else 0
        for i in trigrams(host)
    ])  # extract the trigram

    # 3. Gib_value Feature
    gib_value = int(
        gib_detect_train.avg_transition_prob(host, model_mat) > threshold)

    try:
        fw.write(
            '%s\t%s\t%s\t%.3f\t%.1f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'
            % (
                url,
                is_malicious,
                cctld_num,
                entropy1,
                f_len1,
                entropy1 / f_len1,
                # vowel_ratio,digit_ratio,repeat_letter,consec_digit,consec_consonant,
                ave(unigram_rank),
                ave(bigram_rank),
                ave(trigram_rank),
#!/usr/bin/python

import pickle
import gib_detect_train
import os
import sys

gib_model_file = os.path.join(os.path.dirname(__file__), 'gib_model.pki')
if not os.path.isfile(gib_model_file): gib_detect_train.train()
model_data = pickle.load(open(gib_model_file, 'rb'))
model_mat = model_data['mat']
threshold = model_data['thresh']


def is_gibberish(string):
    return gib_detect_train.avg_transition_prob(string, model_mat,
                                                threshold) <= threshold


if __name__ == "__main__":
    if len(sys.argv) > 1: sys.exit(not is_gibberish(sys.argv[1]))

    print('threshold: ' + str(threshold))
    while True:
        line = input()
        prob = gib_detect_train.avg_transition_prob(line, model_mat,
                                                    threshold)
        print('prob: ' + str(prob))
        print('gibberish? ' + str(is_gibberish(line)))
Example #20
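Concatenated dump of the classic Gibberish-Detector sources: the interactive gib_detect script, the start of gib_detect_train, and a glued is_word_gibberish helper.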
__FILENAME__ = gib_detect
#!/usr/bin/python

import pickle
import gib_detect_train

model_data = pickle.load(open('gib_model.pki', 'rb'))

model_mat = model_data['mat']
threshold = model_data['thresh']

while True:
    line = input()
    print(gib_detect_train.avg_transition_prob(line, model_mat) > threshold)

########NEW FILE########
__FILENAME__ = gib_detect_train
#!/usr/bin/python

import math
import pickle

accepted_chars = 'abcdefghijklmnopqrstuvwxyz '

pos = dict([(char, idx) for idx, char in enumerate(accepted_chars)])

def normalize(line):
    """ Return only the subset of chars from accepted_chars.
    This helps keep the  model relatively small by ignoring punctuation, 
    infrequenty symbols, etc. """
    return [c.lower() for c in line if c.lower() in accepted_chars]
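The scraped copy of gib_detect_train breaks off after normalize(). For reference, here is a minimal sketch of the scoring function every example on this page calls, following the widely circulated Gibberish-Detector implementation; treat the exact bodies as an assumption about this particular copy (the examples that pass threshold as a third argument presumably use a variant that accepts and ignores it):

def ngram(n, l):
    """ Sketch: yield all n-grams of normalize(l) as strings. """
    filtered = normalize(l)
    for start in range(0, len(filtered) - n + 1):
        yield ''.join(filtered[start:start + n])

def avg_transition_prob(l, log_prob_mat):
    """ Sketch: geometric-mean character-transition probability of l. """
    log_prob = 0.0
    transition_ct = 0
    for a, b in ngram(2, l):  # each 2-gram string unpacks into a pair of chars
        log_prob += log_prob_mat[pos[a]][pos[b]]
        transition_ct += 1
    # exponentiate the mean log probability back into a probability
    return math.exp(log_prob / (transition_ct or 1))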
def is_word_gibberish(word):
    """Return True when the trained model classifies the word as gibberish."""
    return gib_detect_train.avg_transition_prob(
        word, model_mat) <= threshold
Example #22
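Scans five-minute NXDOMAIN logs, filters whitelisted and disposable domains, and records source IPs that query several gibberish-looking names.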
def get_candicate_host_domains():
    days = ["20171104"]
    hours = ['%02d' % (hour) for hour in range(24)]
    mins = [
        "00", "05", "10", "15", "20", "25", "30", "35", "40", "45", "50", "55"
    ]
    files = []
    for day in days:
        for hour in hours:
            for minute in mins:
                files.append("/mnt/Work/DNS/data/nxdomains_per_5_mins/" + day +
                             "/" + day + str(hour) + minute + ".txt")

    for file in files:
        print(file)
        day = file.split('/')[-2]
        if not os.path.exists("/mnt/Work/DNS1/results/" + day):
            os.makedirs("/mnt/Work/DNS1/results/" + day)
        file_out = "/mnt/Work/DNS1/results/" + day + "/" + file.split('/')[-1]
        # print(file_out)

        results = {}
        sip_domains = defaultdict(set)
        with open(file) as f:
            for row in f:
                items = row.strip().split(',')
                sip = items[0]
                domain = clean_string(items[3])
                if have_other_characters(domain):
                    continue
                if F.in_tld_whitelist(domain):
                    continue
                # if F.in_CDN(domain):
                #     continue
                if F.in_disposable(domain):
                    continue
                if F.in_DHCP(domain):
                    continue
                if not A.in_alexa_top(domain):
                    sip_domains[sip].add(domain)

        count = 0
        for sip, domains in sorted(sip_domains.items(),
                                   key=lambda x: len(x[1]),
                                   reverse=True):
            # main_domains = [tldextract.extract(domain_1).domain for domain_1 in domains]
            # main_domains = ['.'.join(tldextract.extract(domain_1)[:2]).replace('.', '') for domain_1 in domains]
            domain_value = {
                l: int(
                    gib_detect_train.avg_transition_prob(
                        tldextract.extract(l).domain, model_mat) > threshold)
                if len(tldextract.extract(l).domain) > 9 else 1
                for l in domains
            }
            if len([d for d, v in domain_value.items() if v == 0]) > 2:
                # if Counter([int(gib_detect_train.avg_transition_prob(l, model_mat) > threshold ) if len(l) > 9 else 1 for l in main_domains])[0] > 2:
                # print("{}: {}".format(sip, domains))
                results[sip] = [
                    d for d, v in domain_value.items() if v == 0
                ]
                count += 1
        print(count)

        for key in results:
            results[key] = sorted(results[key], key=lambda x: len(x))

        with open(file_out, "w") as f_out:
            json.dump(results, f_out, sort_keys=True, indent=4)
Example #23
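Fragment that extracts degree-like topics, keeps only pronounceable ones, and then writes raw line scores to an output file.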
list_topics = list_topics.split(",")

degree = []
pure_degree = []
for i in range(len(list_topics)):
    if ("h" in list_topics[i]):
        res = re.sub("\d+", '', list_topics[i])
        pattern = re.compile("[^\w ]")
        vare = pattern.sub('', res)

        j = vare.replace("h", '', 1)
        if (len(j) > 2):

            model_mat = model_data['mat']
            threshold = model_data['thresh']
            boz = gib_detect_train.avg_transition_prob(j,
                                                       model_mat) > threshold

            if boz:
                j = " ".join(j.split())
                degree.append(j)

d = dog("bolt://localhost:7687", "neo4j", "mathers22")
degree = list(dict.fromkeys(degree))
print(degree)
subject_er = 'Electromagnetics'
subject_e = wikipedia.search(subject_er)[0]
print(subject_e)
# d.add_person(subject_e,degree)

while ("" in degree):
    degree.remove("")
    for s in lines:
        j = 0
        for j in range(0, len(s)):
            if s[j] == '.':
                break
        temp = s[0:j]
        query = temp

        model_mat = model_data['mat']
        threshold = model_data['thresh']
        score = gib_detect_train.avg_transition_prob(temp, model_mat)

        outF.write(str(score))
        outF.write("\n")
        count += 1
        print(count)

        if count == 1000:
            break

outF.close()



    # linguistic features: % of vowels, % of digits, % of repeated letters, % of consecutive digits and % of consecutive non-'aeiou' characters
    vowel_ratio = count_vowels(main_domain)/f_len
    digit_ratio = count_digits(main_domain)/f_len
    repeat_letter = count_repeat_letter(main_domain)/f_len
    consec_digit = consecutive_digits(main_domain)/f_len
    consec_consonant = consecutive_consonant(main_domain)/f_len

    # probability of staying in the markov transition matrix (trained on Alexa domains)
    hmm_prob_ = hmm_prob(hmm_main_domain)
    # probability is too low to be non-DGA
    if hmm_prob_ < math.e**hmm_prob_threshold:
        hmm_log_prob = -999.
    else:
        hmm_log_prob = math.log(hmm_prob_)

    # advanced linguistic feature: pronounceable domain
    gib_value = int(gib_detect_train.avg_transition_prob(main_domain.strip('$'), model_mat) > threshold)
    try:
        fw.write('%s\t%s\t%s\t%.3f\t%.1f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n'
                %(domain,cla,tld,entropy,f_len,entropy/f_len,vowel_ratio,
                digit_ratio,repeat_letter,consec_digit,consec_consonant,gib_value,hmm_log_prob,
                ave(unigram_rank),ave(bigram_rank),ave(trigram_rank),
                std(unigram_rank),std(bigram_rank),std(trigram_rank),
                has_private_tld)
                )
    except UnicodeEncodeError:
        continue
    #fw.write('%s\t%s\t%s\t%.3f\t%.1f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'
    #        %(domain,cla,tld,entropy,f_len,entropy/f_len,unigram_rank.mean(),bigram_rank.mean(),trigram_rank.mean(),
    #        unigram_rank.std(),bigram_rank.std(),trigram_rank.std()))
    #fw.write('%s\t%s\t%s\t%.3f\t%.1f\t%.3f\t%s\t%s\n'%(domain,cla,tld,entropy,f_len,entropy/f_len,','.join(bigram),','.join(trigram)))
Example #26
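Minimal script that prints the threshold and the score of a fixed numeric string.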
#!/usr/bin/python

import pickle
import gib_detect_train

model_data = pickle.load(open('gib_model.pki', 'rb'))

# while True:
#     # l = raw_input()
model_mat = model_data['mat']
threshold = model_data['thresh']

print(threshold)
print(gib_detect_train.avg_transition_prob("000000000027", model_mat))
Example #27
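Returns a boolean verdict for a string against the instance's model and threshold.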
	def getRating(self, string):
		return gib_detect_train.avg_transition_prob(string, self.model_mat) > self.threshold
Example #29
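Returns the raw score for a string, rounded to two decimals.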
	def getScore(self, string):
		return round(gib_detect_train.avg_transition_prob(string, self.model_mat), 2)
Example #31
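Fragment computing consecutive-digit and consecutive-consonant ratios plus a pronounceability flag, then writing per-domain feature rows to a CSV.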
        if str(line)[i].isdigit():
            if str(line)[i + 1].isdigit():
                consdig += 1
        if str(line)[i].isalpha() and str(line)[i] not in 'aeiouAEIOU':
            if str(line)[i + 1].isalpha() and str(line)[i + 1] not in 'aeiouAEIOU':
                conscon += 1

    consonantratio.append(conscon / float(leng))
    consletterratio.append(consletter / float(leng))
    consdigitratio.append(consdig / float(leng))
 
    model_mat = model_data['mat']
    threshold = model_data['thresh']
    if gib_detect_train.avg_transition_prob(str(line)[2:leng+2], model_mat) > threshold:
        pronounciation.append('1')
    else:
        pronounciation.append('0')
 
csvfile2 = open('result_train_' + str(para) + '.csv', 'w', newline='')
if str(para) == 'black':
    x = 0
else:
    x = 1
writer = csv.writer(csvfile2)
# writer.writerow(['domain', 'length', 'entropy', 'Pronunciation', 'Vowel ratio', 'Digit ratio', 'Repeat letter', 'Consecutive digit ratio', 'Consecutive consonant ratio', 'N-gram score w', 'N-gram score d', 'Q'])
i = 0
while i < len(name):
    writer.writerow([name[i], length[i], entropy[i], pronounciation[i], vowelratio[i], digitratio[i], consletterratio[i], consdigitratio[i], consonantratio[i], float(ngramwhite[i]/10000), float(ngramdict[i]/10000), x])
    i = i + 1
Example #32
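Extracts a numeric feature vector from a domain: length, entropy, separator and digit statistics, embedded-IP detection, case transitions, and the gibberish flag.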
def get_feature(domain):
    arr = domain.split('\n')
    domain = arr[0]
    lens = len(domain)
    separator = 0.0
    bt_alpha = 0.0
    max_alpha = 0.0
    digit = 0.0
    bt_digit = 0.0
    max_digit = 0.0
    special = 0.0
    trans = 0.0
    bt_separator = 0.0
    bt = 0.0
    flag = 0
    upper = 0.0
    hasip = 0.0
    for i in range(lens):
        try:
            x = domain[i]
            #print x
            bt = bt + 1
            if (bt_alpha > max_alpha):
                max_alpha = bt_alpha
            if (bt_digit > max_digit):
                max_digit = bt_digit

            if (x == '-'):
                bt_alpha = 0.0
                bt_digit = 0.0
                separator = separator + 1
                if (bt - 1 > bt_separator and flag == 1):
                    bt_separator = bt - 1
                bt = 0.0
                flag = 1
            elif (x.isalpha()):
                bt_alpha = bt_alpha + 1
                bt_digit = 0

            elif (x.isdigit()):
                digit = digit + 1
                bt_digit = bt_digit + 1
                bt_alpha = 0.0
                j = i + 1
                while (j < lens) and (domain[j].isdigit()
                                      or domain[j] == '.'):
                    j = j + 1
                    if checkip(domain[i:j]):
                        hasip = 1.0

            elif (not (x == '.')):
                #print x
                bt_alpha = 0.0
                bt_digit = 0.0
                special = special + 1
            else:
                bt_alpha = 0.0
                bt_digit = 0.0
            if (x.isupper()):
                upper = upper + 1
            if ((i >= 1) and (not (x == '.'))):
                j = i - 1
                while (domain[j] == '.'):
                    j = j - 1
                if ((x.isalpha() and domain[j].isdigit())
                        or (x.isdigit() and domain[j].isalpha())):
                    trans = trans + 1
        except Exception:
            print('URLError:' + domain)
    f_len = float(len(domain))
    count = Counter(domain).most_common()
    entropy = -sum(j / f_len * (math.log(j / f_len)) for i, j in count)
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    threshold = model_data['thresh']
    gib_value = int(
        gib_detect_train.avg_transition_prob(domain, model_mat) > threshold)

    if (not lens == 0):
        rates = float(digit) / float(lens)
        trans_rates = float(trans) / float(lens)
    else:
        rates = 0.0
        trans_rates = 0.0
    return (float(lens), hasip, entropy, separator, special, digit, rates,
            trans_rates, upper, bt_separator, max_digit, max_alpha,
            float(gib_value))
Example #34
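Interactive loop printing the score and verdict for each typed line.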
#!/usr/bin/python

import pickle
import gib_detect_train

model_data = pickle.load(open('gib_model.pki', 'rb'))
model_mat = model_data['mat']
threshold = model_data['thresh']
print "threshold: ", threshold

while True:
    l = raw_input()
    result = gib_detect_train.avg_transition_prob(l, model_mat)
    print(result)
    print result > threshold


Example #35
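Elasticsearch-driven DGA hunter: aggregates DNS queries per source IP, flags high-entropy, unpronounceable domains with poor n-gram ranks, groups them by length and format, then screens survivors with pinyin and WHOIS checks.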
	def main(self):
		# initialize result lists
		self.src_ip_list = []
		self.resultList_dga = []
		self._list_ = []
		self.resultList = []


		# run the aggregation query and extract source IPs
		src_ip_aggs = self.es.search(index=self.index, body=self.body_aggs, request_timeout = 3600)
		src_ip_list_buckets = src_ip_aggs['aggregations']['group_by_src_ip']['buckets']

		for _ip in src_ip_list_buckets:
			self.src_ip_list.append(_ip.get('key_as_string'))
		print ("srcip_num = ", len(self.src_ip_list))

		# initialize the n-gram rank data
		n_gram_file = open('n_gram_rank_freq.txt','r')
		gram_rank_dict = dict()
		for i in n_gram_file:
		    cat,gram,freq,rank = i.strip().split(',')
		    gram_rank_dict[gram]=int(rank)
		n_gram_file.close()

		cc = 0

		# extract DNS query events for each source IP
		for src_ip in self.src_ip_list:
			print (src_ip)
			DNS_query = self.es_search(src_ip)['aggregations']['group_by_domain_name']['buckets']
			init_DNS_query = []
			domain_doc_count = []
			tup_len_format_list = []
			src_domain_list = []

			for _dns in DNS_query:
				init_DNS_query.append(_dns.get('key'))
				domain_doc_count.append(_dns.get('doc_count'))

			init_DNS_query_count = len(init_DNS_query)
			cc = cc+init_DNS_query_count

			for _key in range(init_DNS_query_count):
				domain_name = init_DNS_query[_key].lower()

				# compute n-gram ranks
				bigram = [''.join(i) for i in self.bigrams(domain_name)]
				trigram = [''.join(i) for i in self.trigrams(domain_name)]

				# extract the bigram ranks
				bigram_rank = np.array([gram_rank_dict[''.join(i)] if ''.join(i) in gram_rank_dict else 0 for i in self.bigrams(domain_name)])

				# extract the trigram ranks
				trigram_rank = np.array([gram_rank_dict[''.join(i)] if ''.join(i) in gram_rank_dict else 0 for i in self.trigrams(domain_name)])

				# Shannon entropy as a randomness measure
				f_len = float(len(domain_name))
				shannon_count = Counter(i for i in domain_name).most_common()
				entropy = -sum(j/f_len*(math.log(j/f_len)) for i,j in shannon_count)

				# pronounceability check based on the trained model
				model_mat = self.model_data['mat']
				threshold = self.model_data["thresh"]
				git_bool = gib_detect_train.avg_transition_prob(domain_name, model_mat) > threshold

				# judge whether the domain looks trustworthy
				if entropy > 2.2 and not git_bool and (self.ave(bigram_rank) > 250 or self.ave(trigram_rank) > 3000):
					# keep domains queried more than 100 times
					if domain_doc_count[_key] > 100:
						# domain length
						_len = len(domain_name)
						# domain format
						_format = self.format(domain_name)
						# (length, format) tuple
						_tup = (_len,_format)
						tup_len_format_list.append(_tup)
						tup_src_domain = (src_ip , domain_name)
						src_domain_list.append(tup_src_domain)
						test_dga = list(set(src_domain_list))	

			len_tup_len_format_list = len(tup_len_format_list)

			# collect domains sharing the same length and format
			for x in tup_len_format_list:
				c = 0
				initList_dga = []
				y = 0
				while y < len_tup_len_format_list:
					if x == tup_len_format_list[y]:
						c = c + 1
						initList_dga.append(src_domain_list[y])
					if c == 6:
						self.resultList_dga = self.resultList_dga + initList_dga
						break
					y = y + 1


		# deduplicate the result list
		self.resultList_dga = list(set(self.resultList_dga))


		# Update Aug 3: also check whether a domain is made of pinyin or pinyin initials; drop it if so
		for r_line in open(self.path+'sougou_db.txt','r'):
			r_line = r_line.rstrip("\n")
			self._list_.append(r_line)
		for _string in self.resultList_dga:
			self.pinyin_or_word(_string)

		print ('all=',cc)

		# print the results
		if len(self.resultList):
			print ("#######################")
			print ('DGA detected!')
			print ("#######################")
			print (self.resultList)
			print ("domain len is:" , len(self.resultList))
			print ("#######################")
		else :
			print ("no DGA detected")


		# Update Aug 2: WHOIS lookup
		non_whois_count = 0
		whois_count = 0
		error_count = 0
		for _init_whois in self.resultList:
			result = ""
			_whois_domain = _init_whois[1]
			try:
				result = whois.whois(_whois_domain)["domain_name"]
			except whois.parser.PywhoisError:
				print ("PywhoisError")
				error_count = error_count +1
			except KeyError:
				print ("KeyError")
				error_count = error_count +1
			except socket.timeout:
				print ("Timeout")
				error_count = error_count +1
			except ConnectionResetError:
				print ("ConnectionResetError")
				error_count = error_count +1
			except socket.gaierror:
				print ("socket.gaierror")
				error_count = error_count +1
			if result is None or result == "":
				non_whois_count = non_whois_count +1
		#		self.resultList.append(_init_whois)
			else:
				whois_count = whois_count +1
		print ("non-exist whois count is:", non_whois_count)
		print ("error count is:", error_count)
		print ("exist whois count is:", whois_count)
		print ("exist whois percent is:", whois_count/len(self.resultList))