Example #1
class json2arff:
    def __init__(self):
        self.logger = NsLog("log")

    def convert_for_train(self, features, param):

        # build the ARFF header
        try:
            ArffStr = '''@relation weka-test\n\n'''

            features_keys_url = list(features[0]['url_features'].keys())
            features_keys_active = []

            if param == '-a':
                features_keys_active = list(
                    features[0]['active_features'].keys())

            for line in features_keys_url:
                ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

            if param == '-a':
                for line in features_keys_active:
                    ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

            ArffStr = ArffStr + '@attribute class {phish, legitimate}' + "\n\n@data\n"
        except:
            self.logger.debug("Hata - Json_to_arff e gelen sample sayısı" +
                              str(len(features)) + "\nurl_feature_keys: " +
                              str(features_keys_url) +
                              "\nactive_features_key: " +
                              str(features_keys_active))
            self.logger.error("Error Arff Header : {0}".format(format_exc()))
        # end of header

        for each_domain in features:
            try:
                tmp = ""

                for key in features_keys_url:
                    tmp = tmp + str(each_domain['url_features'][key]) + ","

                if param == '-a':
                    for key_a in features_keys_active:
                        tmp = tmp + str(
                            each_domain['active_features'][key_a]) + ","

                tmp = tmp + each_domain['info']['class'] + "\n"
                ArffStr = ArffStr + tmp
            except:
                self.logger.debug("Arffe çevrilen sample da hata :\n" +
                                  str(each_domain))
                self.logger.error("Error Arff Body : {0}".format(format_exc()))

        return ArffStr

    def convert_for_test(self, features, param):

        # TODO: update according to the active rules

        # build the ARFF header

        ArffStr = '''@relation weka-test\n\n'''

        features_keys_url = features[0]['url_features'].keys()

        if param == '-dns':
            features_keys_dns = features[0]['dns_features'].keys()

        for line in features_keys_url:
            ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

        if param == '-dns':
            for key in features_keys_dns:
                ArffStr = ArffStr + '@attribute ' + key + " numeric\n"

        ArffStr = ArffStr + "\n@data\n"

        # end of header

        for each_domain in features:
            tmp = ""

            for key in features_keys_url:
                tmp = tmp + str(each_domain['url_features'][key]) + ","

            if param == '-dns':
                for key_dns in features_keys_dns:
                    tmp = tmp + str(each_domain['dns_features'][key_dns]) + ","

            tmp = tmp[0:len(tmp) - 1] + "\n"
            ArffStr = ArffStr + tmp

        return ArffStr

    def convert_for_NLP_without_features(self, features):
        # build the ARFF header
        try:
            ArffStr = '''@relation weka-test\n\n'''

            ArffStr += '@attribute words string\n'

            ArffStr += '@attribute class {phish, legitimate}' + "\n\n@data\n"

            for sample in features:
                ArffStr += "'"
                for word in sample['info']['nlp_info']['words_nlp']:
                    ArffStr += word + " "
                ArffStr = ArffStr.strip() + "',{0}\n".format(
                    sample['info']['class'])

        except:
            self.logger.error("Error Arff Header : {0}".format(format_exc()))

        return ArffStr

    def convert_for_NLP_with_features(self, features):
        # build the ARFF header
        try:
            features_keys_url = list(features[0]['url_features'].keys())

            ArffStr = '''@relation weka-test\n\n'''

            ArffStr += '@attribute words string\n'

            for line in features_keys_url:
                ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

            ArffStr += '@attribute class {phish, legitimate}' + "\n\n@data\n"

            for sample in features:
                ArffStr += '"'
                for word in sample['info']['nlp_info']['words_nlp']:
                    ArffStr += word + " "

                ArffStr = ArffStr.strip() + '",'

                for key in features_keys_url:
                    ArffStr += str(sample['url_features'][key]) + ","
                ArffStr = ArffStr.strip() + '{0}\n'.format(
                    sample['info']['class'])

        except:
            self.logger.error("Error Arff Header : {0}".format(format_exc()))

        return ArffStr
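
A minimal usage sketch for the converter above (hypothetical sample data; the feature-dict layout only mirrors what convert_for_train reads, and json2arff assumes the project's NsLog and traceback.format_exc are importable):

# Hypothetical sketch: feed one sample through convert_for_train with
# active features enabled ('-a').
features = [{
    'url_features': {'domain_length': 9, 'subDomainCount': 2},
    'active_features': {'google_safe_browsing': 0},
    'info': {'class': 'legitimate'},
}]

converter = json2arff()
print(converter.convert_for_train(features, '-a'))
# @relation weka-test
#
# @attribute domain_length numeric
# @attribute subDomainCount numeric
# @attribute google_safe_browsing numeric
# @attribute class {phish, legitimate}
#
# @data
# 9,2,0,legitimate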
Example #2
class url_rules:
    def __init__(self):

        print("initializing")

        self.logger = NsLog("log")
        self.path_data = "data/"
        self.name_brand_file = "allbrand.txt"
        self.path_alexa_files = "../data/alexa-tld/"

        self.nlp_manager = nlp_class()
        self.pp = pprint.PrettyPrinter(indent=4)
        self.word_splitter = WordSplitterClass()

        allbrand_txt = open(self.path_data + self.name_brand_file, "r")
        self.allbrand = self.__txt_to_list(allbrand_txt)

        #  trie
        #self.trie_alexa_tld = pygtrie.CharTrie(json.loads(open("constant/alexa_tld_json.txt", "r").read()))
        #self.trie_alexa_tldsiz = pygtrie.CharTrie(json.loads(open("constant/alexa_tldsiz_dct.json", "r").read()))

    def __txt_to_list(self, txt_object):

        lines = []

        for line in txt_object:
            lines.append(line.strip())

        txt_object.close()

        return lines

    def rules_main(self, domain, tld, subdomain, path, words_raw):

        features = {}
        info_nlp = {}

        try:
            features.update(self.digit_count(domain, subdomain,
                                             path))  # digitcount
            features.update(self.length(domain, subdomain, path))  # length
            features.update(self.tld_check(tld))  # tld check
            features.update(self.check_rule_5(words_raw))  # www-com
            features.update(self.punny_code(domain))  # punnycode
            features.update(self.random_domain(domain))  # random_domain
            features.update(self.subdomain_count(subdomain))  # subdomain count
            features.update(self.char_repeat(words_raw))  # char_repeat
            # features.update(self.alexa_check(domain, tld))                         # alexa1m  check
            #features.update(self.alexa_trie(domain, tld))                         # alexa1m check trie
            features.update(self.special_chars(domain, subdomain,
                                               path))  # - . / @
            features.update(self.check_domain_in_list(domain))

            result_nlp = self.nlp_features(words_raw)
            features.update(result_nlp['features'])  # words_info

            info_nlp = result_nlp['info']

        except:
            self.logger.error("url_rules.main() Error : {0}".format(
                format_exc()))

        return info_nlp, features

    def digit_count(self, domain, subdomain, path):

        result = {
            'domain_digit_count': 0,
            'subdomain_digit_count': 0,
            'path_digit_count': 0
        }

        for letter in domain:
            if letter.isdigit():
                result['domain_digit_count'] = result['domain_digit_count'] + 1

        for letter in subdomain:
            if letter.isdigit():
                result['subdomain_digit_count'] = result[
                    'subdomain_digit_count'] + 1

        for letter in path:
            if letter.isdigit():
                result['path_digit_count'] = result['path_digit_count'] + 1

        return result

    def length(self, domain, subdomain, path):

        result = {}

        result['domain_length'] = len(domain)
        result['subdomain_length'] = len(subdomain)
        result['path_length'] = len(path)

        return result

    def tld_check(self, tld):

        common_tld = ["com", "org", "net", "de", "edu", "gov"]

        result = {}

        if tld in common_tld:
            result["isKnownTld"] = 1
        else:
            result["isKnownTld"] = 0

        return result

    def check_rule_5(self, words_raw):

        result = {'www': 0, "com": 0}

        for word in words_raw:
            if 'www' in word:
                result['www'] = result['www'] + 1

            if 'com' in word:
                result['com'] = result['com'] + 1

        return result

    def punny_code(self, line):

        result = {}

        if line.startswith("xn--"):

            result['punnyCode'] = 1
            return result

        else:
            result['punnyCode'] = 0
            return result

    def random_domain(self, domain):

        result = {'random_domain': self.nlp_manager.check_word_random(domain)}

        return result

    def subdomain_count(self, line):

        sub = line.split(".")

        result = {}
        result['subDomainCount'] = len(sub)

        return result

    def __all_same(self, items):
        return all(x == items[0] for x in items)

    def char_repeat(self, words_raw):

        result = {'char_repeat': 0}
        repeat = {'2': 0, '3': 0, '4': 0, '5': 0}
        part = [2, 3, 4, 5]

        "sliding window mantigi repeat sayisi kadar eleman al" \
        "hepsi ayni mi diye bak - ayni ise artir"

        for word in words_raw:
            for char_repeat_count in part:
                for i in range(len(word) - char_repeat_count + 1):
                    sub_word = word[i:i + char_repeat_count]
                    if self.__all_same(sub_word):
                        repeat[str(char_repeat_count
                                   )] = repeat[str(char_repeat_count)] + 1

        result['char_repeat'] = sum(list(repeat.values()))

        return result

    def alexa_check(self, domain, tld):

        is_find_tld = 0
        is_find = 0
        line = domain + "." + tld

        letter = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "r", "s", "t", "u", "v", "y", "z", "w", "x", "q",
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "0"
        ]

        try:
            if line[0] in letter:
                alexa_txt = open(
                    "{0}{1}.txt".format(self.path_alexa_files, line[0]), "r")
                alexaList_tld = []  # with TLD
                alexa_list = []  # without TLD

                for alexa_line in alexa_txt:
                    alexaList_tld.append(alexa_line.strip())
                    alexa_list.append(alexa_line.strip().split(".")[0])
                alexa_txt.close()

                for alexa_line in alexaList_tld:
                    if line.strip() == alexa_line.strip():
                        is_find_tld = 1
                        break

                for alexa_line in alexa_list:
                    line_domain = line.split(".")[0]
                    if line_domain.strip() == alexa_line.strip():
                        is_find = 1
                        break
        except:
            self.logger.debug(line + "işlenirken hata uzunluktan dolayı")
            self.logger.error(
                "url_rules.check_rule_11()-Alexa  /  Error : {0}".format(
                    format_exc()))

        result = {}

        if is_find_tld == 1:
            result['alexa1m_tld'] = 1
        else:
            result['alexa1m_tld'] = 0

        if is_find == 1:
            result['alexa1m'] = 1
        else:
            result['alexa1m'] = 0

        return result

    def alexa_trie(self, domain, tld):

        line = domain + "." + tld

        result = {}

        try:
            #if self.alexa1mm[line[0].lower()].has_key(line):
            if self.trie_alexa_tld.has_key(line):
                result['alexa1m_tld_trie'] = 1
            else:
                result['alexa1m_tld_trie'] = 0

            if self.trie_alexa_tldsiz.has_key(domain):
                result['alexa1m_tldsiz_trie'] = 1
            else:
                result['alexa1m_tldsiz_trie'] = 0
        except:
            self.logger.debug(line + "işlenirken alexa")
            self.logger.error(
                "url_rules.check_rule_11()-Alexa  /  Error : {0}".format(
                    format_exc()))

        return result

    def special_chars(self, domain, subdomain, path):

        special_char = {
            '-': 0,
            ".": 0,
            "/": 0,
            '@': 0,
            '?': 0,
            '&': 0,
            '=': 0,
            "_": 0
        }
        special_char_letter = special_char.keys()

        for l in domain:
            if l in special_char_letter:
                special_char[l] = special_char[l] + 1

        for l in subdomain:
            if l in special_char_letter:
                special_char[l] = special_char[l] + 1

        for l in path:
            if l in special_char_letter:
                special_char[l] = special_char[l] + 1

        return special_char

    def check_domain_in_list(self, domain):

        result = {}
        if domain in self.allbrand:
            result['domain_in_brand_list'] = 1
        else:
            result['domain_in_brand_list'] = 0

        return result

    def nlp_features(self, words_raw):
        """
        keywords_in_words, brands_in_words,
        dga_in_words, len_lt_7, len_gt_7 
        """
        grouped_words = self.nlp_manager.parse(words_raw)
        splitted_words = self.word_splitter._splitl(grouped_words['len_gt_7'])
        """
        found_keywords, found_brands,
        similar_to_keyword, similar_to_brand,
        other_words, target_words
        """

        fraud_analyze_result = self.nlp_manager.fraud_analysis(
            grouped_words, splitted_words)

        result = self.nlp_manager.evaluate(grouped_words, fraud_analyze_result,
                                           splitted_words)
        split = {'raw': grouped_words['len_gt_7'], 'splitted': splitted_words}
        result['info']['compoun_words'] = split

        return result
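
A sketch of the passive rules in isolation (hypothetical URL parts; url_rules.__init__ opens data/allbrand.txt and builds an nlp_class, so the sketch bypasses it with __new__ and only calls rules that need no instance state):

# Hypothetical sketch: exercise the self-contained rules on the parts of
# http://paypal-login.example.com/path123 without running __init__.
rules = url_rules.__new__(url_rules)  # skip the constructor's file I/O

print(rules.digit_count('paypal-login', 'example', '/path123'))
# {'domain_digit_count': 0, 'subdomain_digit_count': 0, 'path_digit_count': 3}
print(rules.length('paypal-login', 'example', '/path123'))
# {'domain_length': 12, 'subdomain_length': 7, 'path_length': 8}
print(rules.tld_check('com'))         # {'isKnownTld': 1}
print(rules.punny_code('xn--pypal'))  # {'punnyCode': 1}
print(rules.special_chars('paypal-login', 'example', '/path123'))
# per-character counts of - . / @ ? & = _ across the three parts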
Example #3
class machine_learning_algorithm():

    def __init__(self, algorithm, train_data_name="gsb.arff"):

        self.logger = NsLog("log")

        self.path_output_arff = "../output/arff/"
        self.path_test_output = ""

        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.train_data_name = train_data_name
        self.rule_calculation = rule_extraction()

        self.train, self.test, self.train_label, self.test_label = self.split_test_and_train_data()

        self.time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")

        if algorithm == 'NB':
            self.model = self.create_model_NB()
            self.active_model = self.active_create_model_NB()
        elif algorithm == 'RF':
            self.model = self.create_model_RF()
            self.active_model = self.active_create_model_RF()
        elif algorithm == 'GB':
            self.model = self.create_model_GB()
            self.active_model = self.active_create_model_GB()

    def txt_to_list(self, txt_object):

        lst = []

        for line in txt_object:
            lst.append(line.strip())

        txt_object.close()

        return lst

    def split_test_and_train_data(self):
        try:
            data, label = self.preparing_train_data(self.train_data_name)
            train, test, train_label, test_label = train_test_split(
                data, label, test_size=0.25, random_state=42)
        except:
            self.logger.debug(self.train_data_name +
                              " caused an error while splitting into train and test sets")
            self.logger.error("Error : {0}".format(format_exc()))

        return train, test, train_label, test_label

    def preparing_train_data(self, file_name="gsb.arff"):

        train = []
        target = []

        try:
            train_dataset, train_meta = arff.loadarff(open("{0}{1}".format(self.path_output_arff, file_name), "r"))

            train = train_dataset[train_meta.names()[:-1]]  # everything but the last column
            target = train_dataset[train_meta.names()[len(train_meta.names()) - 1]]  # last column

            train = np.asarray(train.tolist(), dtype=np.float32)  # the crucial cast to float32
        except:
            self.logger.debug(file_name + " caused an error during training")
            self.logger.error("Error : {0}".format(format_exc()))

        return train, target

    def preparing_test_data(self, test_dataset_list):
        try:
            feat_json = open("../output/test-output/json-"+self.time_now+".txt", "w")
            feat_arff = open("../output/test-output/arff-"+self.time_now+".arff", "w")

            "domain_parsed to json without class"
            self.test_parsed_domains = self.parser_object.parse_nonlabeled_samples(test_dataset_list)

            "rule calculation for test samples without class information -- output json format"
            test_features = self.rule_calculation.extraction(self.test_parsed_domains)

            "test sampleları için oluşturulan json -> arff e dönüştür. Class yok."
            arff_test_str = self.json2arff_object.convert_for_test(test_features, '')

            feat_json.write(json.dumps(test_features))
            feat_arff.write(arff_test_str)

            feat_arff.close()
            feat_json.close()

            arff_raw = StringIO(arff_test_str)

            test_dataset, test_meta = arff.loadarff(arff_raw)

            test = test_dataset[test_meta.names()]
            test = np.asarray(test.tolist(), dtype=np.float32)
        except:
            self.logger.error("Test verisi ayarlanırken hata  /  Error : {0}".format(format_exc()))
        return test, self.test_parsed_domains

    def create_model_NB(self):

        gnb = GaussianNB()
        model = gnb.fit(self.train, self.train_label)

        return model

    def create_model_RF(self):
        # the original instantiated GradientBoostingClassifier here; a
        # RandomForestClassifier (sklearn.ensemble) matches the method name
        clf = RandomForestClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0)
        model = clf.fit(self.train, self.train_label)

        return model

    def create_model_GB(self):
        clf = GradientBoostingClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0)
        # alternative, more heavily regularized configuration:
        # clf = GradientBoostingClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0, max_features=50, min_samples_split=50, min_samples_leaf=50)
        model = clf.fit(self.train, self.train_label)

        return model

    def active_create_model_NB(self):

        train, target = self.preparing_train_data()
        gnb = GaussianNB()
        model = gnb.fit(train, target)

        return model

    def active_create_model_RF(self):
        train, target = self.preparing_train_data()
        # as in create_model_RF, use a RandomForestClassifier to match the method name
        clf = RandomForestClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0)
        model = clf.fit(train, target)

        return model

    def active_create_model_GB(self):
        train, target = self.train, self.train_label

        clf = GradientBoostingClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0)
        model = clf.fit(train, target)

        return model

    def model_run(self, test):

        model = self.model
        model_pre = model.predict(test)
        model_probability = model.predict_proba(test)

        model_pre_list = []
        for p in model_pre:
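            # numpy returns the predicted labels as byte strings (b'phish');
            # strip the b'...' wrapper to get plain text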
            model_pre_list.append(str(p).replace("b'", "").replace("'", ""))

        model_probability = model_probability.tolist()

        return model_pre_list, model_probability

    def active_model_run(self, test):

        model = self.active_model
        model_pre = model.predict(test)
        model_probability = model.predict_proba(test)

        model_pre_list = []
        for p in model_pre:
            model_pre_list.append(str(p).replace("b'", "").replace("'", ""))

        model_probability = model_probability.tolist()

        return model_pre_list, model_probability

    def output(self, test_data):

        test = self.test
        # self.test_parsed_domains = self.parser_object.parse_nonlabeled_samples(test_data)

        model_pre, model_probability = self.model_run(test)
        #test_parsed_domain = self.test_parsed_domains
        result_list = []

        for test_domain in range(len(self.test)):
            result = {}
            result['id'] = test_domain
            result['class'] = model_pre[test_domain]
            result_list.append(result)

        test_result = open("../output/test-output/result-"+self.time_now+".txt", "w")
        test_result.write(json.dumps(result_list))
        test_result.close()

        return result_list

    def active_output(self, test_data):

        test, test_parsed_domains = self.preparing_test_data(test_data)
        model_pre, model_probability = self.active_model_run(test)

        test_parsed_domain = self.test_parsed_domains
        result_list = []
        #print(model_pre, model_probability)

        for test_domain in test_parsed_domain:
            result = {}
            result['domain'] = test_domain['url']
            result['id'] = test_domain['id']
            result['predicted_class'] = model_pre[test_domain['id']]
            result['probability_phish'] = (model_probability[test_domain['id']][1] / sum(model_probability[test_domain['id']])) * 100
            result['probability_legitimate'] = (model_probability[test_domain['id']][0] / sum(model_probability[test_domain['id']])) * 100
            result_list.append(result)

        test_result = open("../output/test-output/result-"+self.time_now+".txt", "w")
        test_result.write(json.dumps(result_list))
        test_result.close()

        return result_list


    def accuracy(self):
        model = self.model
        test_data, test_label = self.preparing_train_data()
        scores = cross_val_score(model, test_data, test_label, cv=10)

        return scores

    def test_accuracy(self):
        test_file = self.txt_to_list(open("../output/features/dp29114.txt", 'r'))

        self.result_list = self.output(test_file)
        count = 0
        print("Starting...\n")
        for i in range(len(self.test)):

            if self.test_label[i].decode('utf-8') == self.result_list[i]['class']:
                count = count + 1

        acc = (count / len(self.test)) * 100

        return acc, count


    def confusion_matrix(self, name):
        """
        train dataseti gsb.arff model içerisinde bu dataset var.
        confisioun matris çıkarmayı istediğimiz datayı preparing_train_data fonksiyonu ile arff formatı okunur.
        okunan dosya data ve label olarak bölünür.
        data model üzerinde çalıştırılır.
        elde edilen tahmin sonuçlarına ilişkin labellar model_preye atılır.

        test_label--bytes array formatında unicode formatına dönüştürülür

        ardından confusion matrix çalıştırılır.
        :param name:
        :return:
        """

        test, test_label = self.preparing_train_data(file_name=name)
        model_pre, model_pro = self.model_run(test)

        test_label_unicode = []

        for t in test_label:
            test_label_unicode.append(str(t, 'utf-8'))
        active = confusion_matrix(test_label_unicode, model_pre, labels=['phish', 'legitimate'])

        test = self.train
        test_label = self.train_label

        model_pre, model_pro = self.model_run(test)

        test_label_unicode = []

        for t in test_label:
            test_label_unicode.append(str(t, 'utf-8'))

        return confusion_matrix(test_label_unicode, model_pre, labels=['phish', 'legitimate']), active
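
A hedged end-to-end sketch of the trainer above; it assumes ../output/arff/gsb.arff, ../output/features/dp29114.txt and the project classes (NsLog, json2arff, domain_parser, rule_extraction) are all available:

# Hypothetical sketch: train a Gradient Boosting model and inspect it.
ml = machine_learning_algorithm('GB', train_data_name="gsb.arff")

print(ml.accuracy())  # 10-fold cross-validation scores on the training ARFF

acc, correct = ml.test_accuracy()  # held-out 25% split made in __init__
print("held-out accuracy: {0:.2f}% ({1} correct)".format(acc, correct))

cm_train, cm_active = ml.confusion_matrix("gsb.arff")
print(cm_train)  # rows/columns ordered as ['phish', 'legitimate']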
Example #4
class nlp_class:
    def __init__(self):
        self.logger = NsLog("log")
        self.path_data = "../data/"
        self.name_keywords = "keywords.txt"
        self.name_brand_file = "allbrand.txt"
        self.name_random_model = "gib_model.pki"

        model_data = pickle.load(open(self.name_random_model, 'rb'))
        self.model_mat = model_data['mat']
        self.threshold = model_data['thresh']

        self.allbrand = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_brand_file), "r"))
        self.keywords = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_keywords), "r"))

    def __txt_to_list(self, txt_object):
        lines = []

        for line in txt_object:
            lines.append(line.strip())
        txt_object.close()
        return lines

    def __is_similar_to_any_element(self, word, candidates):

        target = ''
        for l in candidates:
            if editdistance.eval(word, l) < 2:
                target = l

        if len(target) > 0:
            return word
        else:
            return 0

    def parse(self, words):

        keywords_in_words = []
        brands_in_words = []
        similar_to_brands = []
        similar_to_keywords = []
        dga_in_words = []
        len_gt_7 = []
        len_lt_7 = []
        try:
            for word in words:

                word = re.sub("\d+", "", word)

                if word in self.keywords:
                    keywords_in_words.append(word)

                elif word in self.allbrand:
                    brands_in_words.append(word)

                elif self.__is_similar_to_any_element(word,
                                                      self.allbrand) != 0:
                    target = self.__is_similar_to_any_element(
                        word, self.allbrand)
                    similar_to_brands.append(target)

                elif self.__is_similar_to_any_element(word,
                                                      self.keywords) != 0:
                    target = self.__is_similar_to_any_element(
                        word, self.keywords)
                    similar_to_keywords.append(target)

                elif len(word) > 3 and not word.isnumeric():

                    if gib_detect_train.avg_transition_prob(
                            word, self.model_mat) <= self.threshold:
                        dga_in_words.append(word)
                        # TODO: words resembling keywords should be cleaned out
                    elif len(word) < 7:
                        len_lt_7.append(word)
                    else:
                        len_gt_7.append(word)

            result = {
                'keywords_in_words': keywords_in_words,
                'brands_in_words': brands_in_words,
                'dga_in_words': dga_in_words,
                'len_lt_7': len_lt_7,
                'len_gt_7': len_gt_7,
                'similar_to_brands': similar_to_brands,
                'similar_to_keywords': similar_to_keywords
            }
        except:
            self.logger.debug("error while processing: " + str(words))
            self.logger.error("Error : {0}".format(format_exc()))

        return result

    def fraud_analysis(self, grouped_words, splitted_words):

        word_list = grouped_words['len_lt_7'] + grouped_words[
            'similar_to_brands'] + grouped_words[
                'similar_to_keywords'] + splitted_words

        word_list_nlp = grouped_words['len_lt_7'] + grouped_words['similar_to_brands'] + \
                        grouped_words['similar_to_keywords'] + grouped_words['brands_in_words'] + \
                        grouped_words['keywords_in_words'] + grouped_words['dga_in_words'] + splitted_words

        found_keywords = []
        found_brands = []
        similar_to_keyword = []
        similar_to_brand = []
        other_words = []
        target_words = {'brand': [], 'keyword': []}
        try:
            for word in word_list:

                word = re.sub("\d+", "", word)

                if word in self.keywords:
                    found_keywords.append(word)
                elif word in self.allbrand:
                    found_brands.append(word)
                else:

                    for brand in self.allbrand:
                        if editdistance.eval(word, brand) < 2:
                            target_words['brand'].append(brand)
                            similar_to_brand.append(word)

                    for keyword in self.keywords:
                        if editdistance.eval(word, keyword) < 2:
                            target_words['keyword'].append(keyword)
                            similar_to_keyword.append(word)

                if word not in found_keywords + found_brands + similar_to_keyword + similar_to_brand:
                    other_words.append(word)

            result = {
                'found_keywords': found_keywords,
                'found_brands': found_brands,
                'similar_to_keywords': similar_to_keyword,
                'similar_to_brands': similar_to_brand,
                'other_words': other_words,
                'target_words': target_words,
                'words_nlp': word_list_nlp
            }
        except:
            self.logger.debug("error while processing: " + str(word_list))
            self.logger.error("Error : {0}".format(format_exc()))
        return result

    def evaluate(self, grouped_words, fraud_analyze_result, splitted_words):
        """
        grouped_words
        keywords_in_words, brands_in_words,
        dga_in_words, len_lt_7, len_gt_7 

        fraud_anaylze_result
        found_keywords, found_brands,
        similar_to_keyword, similar_to_brand,
        other_words, target_words 
        """
        try:
            words_raw = grouped_words['keywords_in_words'] + grouped_words['brands_in_words'] + \
                        grouped_words['similar_to_brands'] + grouped_words['similar_to_keywords'] + \
                        grouped_words['dga_in_words'] + grouped_words['len_lt_7'] + grouped_words['len_gt_7']

            words_len = []
            compound_word_len = []

            for word in words_raw:
                words_len.append(len(word))

            for word in grouped_words['len_gt_7']:
                compound_word_len.append(len(word))

            all_keywords = grouped_words[
                'keywords_in_words'] + fraud_analyze_result['found_keywords']
            all_brands = grouped_words[
                'brands_in_words'] + fraud_analyze_result['found_brands']
            similar_brands = fraud_analyze_result['similar_to_brands']
            similar_keywords = fraud_analyze_result['similar_to_keywords']

            if len(compound_word_len) == 0:
                av_com = 0
            else:
                av_com = float(np.average(compound_word_len))

            if len(words_len) == 0:
                min_len = 0
                max_len = 0
                av_w = 0
                std = 0
            else:
                min_len = int(np.min(words_len))
                max_len = int(np.max(words_len))
                av_w = float(np.average(words_len))
                std = float(np.std(words_len))

            result = {
                'info': {
                    'keywords': all_keywords,
                    'brands': all_brands,
                    'dga_in_words': grouped_words['dga_in_words'],
                    'similar_to_keywords': similar_keywords,
                    'similar_to_brands': similar_brands,
                    'negligible_words': fraud_analyze_result['other_words'],
                    'target_words': fraud_analyze_result['target_words'],
                    'words_nlp': fraud_analyze_result['words_nlp']
                },
                'features': {
                    'raw_word_count': len(words_len),
                    'splitted_word_count': len(splitted_words),
                    'average_word_length': av_w,
                    'longest_word_length': max_len,
                    'shortest_word_length': min_len,
                    'std_word_length': std,
                    'compound_word_count': len(grouped_words['len_gt_7']),
                    'keyword_count': len(all_keywords),
                    'brand_name_count': len(all_brands),
                    'negligible_word_count': len(fraud_analyze_result['other_words']),
                    'target_brand_count': len(fraud_analyze_result['target_words']['brand']),
                    'target_keyword_count': len(fraud_analyze_result['target_words']['keyword']),
                    'similar_keyword_count': len(similar_keywords),
                    'similar_brand_count': len(similar_brands),
                    'average_compound_words': av_com,
                    'random_words': len(grouped_words['dga_in_words'])
                }
            }
        except:
            self.logger.error("Error : {0}".format(format_exc()))
        return result

    def check_word_random(self, word):

        if gib_detect_train.avg_transition_prob(
                word, self.model_mat) < self.threshold:
            return 1
        else:
            return 0
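
A sketch of how parse buckets a word list (it needs ../data/keywords.txt, ../data/allbrand.txt and gib_model.pki on disk; the bucket contents shown are illustrative, since they depend on those files):

# Hypothetical sketch of nlp_class.parse on a handful of URL words.
nlp = nlp_class()

grouped = nlp.parse(['secure', 'login', 'paypall', 'xkqzjw', 'accountverify'])
# Illustrative shape of the result:
# {
#     'keywords_in_words':   ['secure', 'login'],  # exact keyword hits
#     'brands_in_words':     [],                   # exact brand hits
#     'similar_to_brands':   ['paypall'],          # edit distance < 2 to a brand
#     'similar_to_keywords': [],
#     'dga_in_words':        ['xkqzjw'],           # flagged by the gibberish model
#     'len_lt_7':            [],
#     'len_gt_7':            ['accountverify'],    # compound words to split later
# }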
Example #5
class Train:
    def __init__(self):
        self.logger = NsLog("log")

        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.rule_calculation = rule_extraction()

        self.path_input = "../input/"
        self.path_arff = "../output/arff/"
        self.path_features = "../output/features/"
        self.path_parsed_domain = "../output/domain_parser/"

    def txt_to_list(self, txt_object):

        lst = []

        for line in txt_object:
            lst.append(line.strip())

        txt_object.close()

        return lst

    def domain_parser(self, param):

        parsed_domains = []

        for i in range(1, len(param), 2):
            try:
                if param[i + 1] == 'phish' or param[i + 1] == 'legitimate':

                    #dataset = self.txt_to_list(open("{0}{1}".format(self.path_input, param[i]), "r"))  # txt read
                    dataset = json.loads(
                        open("{0}{1}".format(self.path_input, param[i]),
                             "r").read())  # json read

                    parsed_domains = parsed_domains + self.parser_object.parse(
                        dataset, param[i + 1], len(parsed_domains))

                else:
                    self.logger.debug(
                        "class labels must be entered one of (phish, legitimate)"
                    )

            except:
                self.logger.error("an error occurred : {0}".format(
                    format_exc()))
                self.logger.debug(
                    "an error occurred while the | {0} | file was being processed".
                    format(param[i]))

        self.logger.info(
            "Domain parsing is done; {0} unique URLs were parsed".format(
                len(parsed_domains)))

        return parsed_domains

    def json_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        file = open(path + file_name, "w")
        file.write(json.dumps(data))
        file.close()
        self.logger.info("{0} Dosyaya Yazıldı.".format(name))

    def arff_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        file = open(path + file_name, "w")
        file.write(data)
        file.close()
        self.logger.info("{0} Dosyaya Yazıldı.".format(name))
class active_rules:
    def __init__(self):
        self.pp = pprint.PrettyPrinter(indent=2)
        self.logger = NsLog("log")

    def goog_safe_browsing(self, domain_features):

        try:

            url_list = []
            updated_domain_features = domain_features
            for sample in domain_features:
                url_list.append(sample['info']['url'])

            sep_list = self.__seperate(url_list, 500)

            # live query:
            # phishing_url_list = self.get_urls(self.google_sb_query(list, sep_list.index(list), len(domain_features)))

            phishing_url_list = self.get_urls(json.loads(open("constant/gb_phish.json", "r").read()))  # from file

            updated_domain_features = []
            for each in domain_features:
                element = each
                if each['info']['url'] in phishing_url_list:
                    element.update({'active_features': {'google_safe_browsing': 1}})
                else:
                    element.update({'active_features': {'google_safe_browsing': 0}})

                updated_domain_features.append(element)
        except:
            self.logger.error("Error : {0}".format(format_exc()))

        return updated_domain_features

    def __seperate(self, url_list, size):

        # split url_list into chunks of at most `size` URLs
        sep_urls = []

        for i in range(0, len(url_list), size):
            sep_urls.append(url_list[i:i + size])

        return sep_urls

    def google_sb_query(self, url_list, count, overall_count):

        query_url_list = self.sb_format(url_list)
        sep_list = self.__seperate(query_url_list, 500)

        phish_url_list = []

        for chunk in sep_list:
            api_key = 'AIzaSyCGmGpCMt-PNQTrWAsp3LqcM_UvCF6NJ1I'
            url = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
            payload = {'client': {'clientId': "mycompany", 'clientVersion': "0.1"},
                       'threatInfo': {'threatTypes': ["SOCIAL_ENGINEERING", "MALWARE"],
                                      'platformTypes': ["ANY_PLATFORM"],
                                      'threatEntryTypes': ["URL"],
                                      'threatEntries': chunk
                                      }}
            params = {'key': api_key}
            r = requests.post(url, params=params, json=payload).json()

            if 'matches' in r.keys():
                for each in r['matches']:
                    phish_url_list.append(each['threat']['url'])
            elif 'error' in r.keys():
                self.logger.debug("Error in Google-SB query - total samples processed: {0}\nchunk (of 500) being processed: {1}".format(overall_count, count))

        return phish_url_list

    def sb_format(self, url_list):

        sb_query = []
        for url in url_list:
            sb_query.append({'url': url})

        return sb_query

    def get_urls(self, ph_db_json):

        urls = []

        for obj in ph_db_json:
            urls.append(obj['url'])

        return urls

    def txt_to_list(self, txt_object):
        lines = []

        for line in txt_object:
            lines.append(line.strip())
        txt_object.close()
        return lines
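
For reference, a standalone sketch of the Safe Browsing v4 request that google_sb_query assembles for each 500-URL chunk (YOUR_API_KEY and the sample URL are placeholders):

# Hypothetical sketch of a single threatMatches:find call.
import requests

entries = [{'url': 'http://phish.example.com/login'}]  # sb_format output shape
payload = {'client': {'clientId': "mycompany", 'clientVersion': "0.1"},
           'threatInfo': {'threatTypes': ["SOCIAL_ENGINEERING", "MALWARE"],
                          'platformTypes': ["ANY_PLATFORM"],
                          'threatEntryTypes': ["URL"],
                          'threatEntries': entries}}

r = requests.post("https://safebrowsing.googleapis.com/v4/threatMatches:find",
                  params={'key': "YOUR_API_KEY"}, json=payload).json()
print(r.get('matches', []))  # each match carries the flagged ['threat']['url']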