def get_features(self, domain_list, psl):
    """Build one feature vector per domain from its sub-labels.

    Each domain is stripped of its public suffix, split into dot-separated
    labels, and every label becomes one (min-max scaled) feature row.
    Adjacent rows belonging to the same domain are concatenated pairwise;
    a lone (or leftover) label is left-padded with zeros so every output
    vector has twice the per-label feature width.

    Returns a list of 1-D numpy arrays; empty list if no domain yields
    any label.
    """
    domain_labels = []
    domain_indexes = []
    for index, d in enumerate(domain_list):
        # Drop the public suffix and the dot in front of it.
        d = d[:d.rindex(psl.publicsuffix(d)) - 1]
        if len(d) == 0:
            continue
        for label in d.split("."):
            domain_labels.append(label)
            domain_indexes.append(index)
    # Guard the scaler against an empty matrix (no usable labels):
    # MinMaxScaler.fit_transform raises on zero samples.
    if not domain_labels:
        return []
    label_features = MinMaxScaler().fit_transform(
        extract_all_features(domain_labels))
    # Pad width follows the actual per-label feature dimension instead of
    # a hard-coded 32, so a changed feature set cannot yield ragged rows.
    zero_array = np.zeros(label_features.shape[1])
    domain_features = []
    i = 0
    while i < len(label_features):
        is_last = i == len(label_features) - 1
        if is_last or domain_indexes[i] != domain_indexes[i + 1]:
            # Single label for this domain: zero-pad the missing half.
            domain_features.append(np.append(zero_array, label_features[i]))
            i += 1
        else:
            # Two consecutive labels of the same domain form one vector.
            domain_features.append(
                np.append(label_features[i], label_features[i + 1]))
            i += 2
    return domain_features
# Example #2
 def get_dataset(self, DGADomain, benignDomain):
     """Assemble a labeled dataset: DGA domains map to 1, benign to 0.

     Extracts per-label character features for every domain and merges
     them into one feature row per domain via unionFeature.

     Raises:
         ValueError: if the feature matrix and label vector lengths
             disagree (previously only printed "error" and returned
             misaligned data).
     """
     domains = DGADomain + benignDomain
     y = np.concatenate((np.ones(len(DGADomain)), np.zeros(len(benignDomain))))
     allLabels, index = self.getAllDomainLabels(domains)
     labelFeatures = char_feature.extract_all_features(allLabels)
     X = self.unionFeature(labelFeatures, index)
     if len(X) != len(y):
         # A mismatch means feature extraction dropped or added rows;
         # failing loudly beats silently returning misaligned training data.
         raise ValueError(
             "feature/label length mismatch: {} != {}".format(len(X), len(y)))
     return X, y
    def get_dataset(self, bfile):
        """Build train/test feature matrices and 1/0 label vectors.

        Splits come from pontus.getTrainTestDomains; DGA domains are
        labelled 1, benign domains 0.
        """
        (trainDGADomain, testDGADomain,
         trainBenignDomain, testBenignDomain) = self.pontus.getTrainTestDomains(
            benignFile="../data_sets/{}".format(bfile))

        def vectorize(dga, benign):
            # One feature row per domain; DGA rows labelled 1, benign 0.
            labels, idx = self.pontus.getAllDomainLabels(dga + benign)
            feats = extract_all_features(labels)
            matrix = self.pontus.unionFeature(feats, idx)
            targets = np.concatenate((np.ones(len(dga)), np.zeros(len(benign))))
            return matrix, targets

        X_train, y_train = vectorize(trainDGADomain, trainBenignDomain)
        X_test, y_test = vectorize(testDGADomain, testBenignDomain)
        return X_train, y_train, X_test, y_test
# Example #4
def createdataset(type="train",
                  AGD_file="../data_sets/split_AGDs",
                  BD_file="../data_sets/split_benign_nx.json",
                  datasetname="nx_train_data"):
    """Create a label-level dataset CSV plus a saved feature matrix.

    Reads AGD and benign domain files, strips each domain's public suffix,
    collects the unique sub-labels of each class, extracts character
    features, and writes ``{datasetname}.csv`` and
    ``{datasetname}_features.npy`` under ../data_sets/.

    type: "train" selects split index 0 of the AGD lists, anything else
        selects index 1; it is also the key into the benign JSON.
    """
    # AGD json maps family -> [train_list, test_list]; pick the split.
    v_index = 0 if type == "train" else 1
    psl = PublicSuffixList()
    with open(AGD_file, "r") as f:
        AGD_dict = json.loads(f.read())
    with open(BD_file, "r") as f:
        bd_dict = json.loads(f.read())
    allAGDs = set()
    allBDs = set()
    for k, v in AGD_dict.items():
        for d in v[v_index]:
            # Everything left of the public suffix (and its leading dot).
            pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
            for l in pre_d.split("."):
                allAGDs.add(l)
    for d in bd_dict[type]:
        pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
        for l in pre_d.split("."):
            allBDs.add(l)
    length = len(allAGDs)
    print(length)
    # Balance the classes by capping benign labels at the AGD count.
    allBDs = list(allBDs)[:length]
    allAGDs = list(allAGDs)
    alldomains = allAGDs + allBDs
    # BUGFIX: label counts must follow the actual list sizes. The benign
    # set can contain fewer than `length` labels, in which case the old
    # `list(np.zeros(length))` produced more labels than domains.
    alllabels = list(np.ones(len(allAGDs))) + list(np.zeros(len(allBDs)))
    allfeatures = extract_all_features(alldomains)
    np.save("../data_sets/{}_features.npy".format(datasetname), allfeatures)
    data = dict()
    data["domains"] = pd.Series(alldomains, dtype='str')
    data["labels"] = pd.Series(alllabels, dtype='int32')
    df = pd.DataFrame(data=data)
    df.to_csv("../data_sets/{}.csv".format(datasetname), index=False)
# Example #5
def MY_expirement_process(root_dir="/home/yandingkui/dga_detection/result_data/", m_file="split_AGDs",
                          benign_file="split_benign_ac.json", n=815, m=10, c='entropy'):
    """Train a RandomForest on per-label character features and print the
    ranked feature importances.

    Args:
        root_dir: directory holding the input JSON files.
        m_file: malicious (AGD) file; JSON mapping family -> [train, pred].
        benign_file: benign JSON with "train" and "pred" keys.
        n: n_estimators for the forest.
        m: max_features for the forest.
        c: split criterion ('gini' or 'entropy').
    """
    psl = PublicSuffixList()
    with open(root_dir + m_file, "r") as f:
        malicious_data = json.loads(f.read())

    with open(root_dir + benign_file, "r") as f:
        benign_data = json.loads(f.read())

    train_domains = []
    train_labels = []
    pred_domains = []
    pred_labels = []
    for k, v in malicious_data.items():
        for d in v[0]:
            d_split = d[:d.index(psl.publicsuffix(d)) - 1].split(".")
            # BUGFIX: pick the *longest* label. The old loop never updated
            # its length counter, so it kept the last label instead — and it
            # reused the name `m`, clobbering the max_features parameter
            # passed to RandomForestClassifier below.
            train_domains.append(max(d_split, key=len))
            train_labels.append(1)
        for d in v[1]:
            pred_domains.append(d)
            pred_labels.append(1)

    for d in benign_data.get("train"):
        pri_d = psl.privatesuffix(d)
        # Leading label of the private suffix (public suffix stripped).
        lm = pri_d[:pri_d.index(psl.publicsuffix(pri_d)) - 1]
        train_domains.append(lm)
        train_labels.append(0)
    for d in benign_data.get("pred"):
        pred_domains.append(d)
        pred_labels.append(0)

    train_features = char_feature.extract_all_features(train_domains)

    # Shuffle features and labels with one shared permutation.
    index = list(range(len(train_domains)))
    random.shuffle(index)
    real_train_features = [train_features[i] for i in index]
    real_train_labels = [train_labels[i] for i in index]

    # Best params previously found by grid search:
    # {'criterion': 'entropy', 'max_features': 14, 'n_estimators': 820, 'random_state': 0}
    clf = RandomForestClassifier(n_estimators=n, max_features=m, criterion=c, random_state=0)
    clf.fit(real_train_features, real_train_labels)

    print("Pontus:feature_importance_")
    im = clf.feature_importances_
    # Rank (1-based feature id, importance) pairs by importance.
    feature_items = [(i + 1, imp) for i, imp in enumerate(im)]
    feature_items.sort(key=takeSecond, reverse=True)
    print(feature_items)
# Example #6
 def getDomainLabelFeatures(self, domainlabels):
     """Return the character-level feature matrix for the given labels."""
     features = char_feature.extract_all_features(domainlabels)
     return features
# Example #7
def get_suspicious(year, month, day):
    """Collect suspicious private suffixes for one day and check activity.

    If a cached domains file exists for the day it is reused; otherwise the
    day's hourly logs are parsed, each domain's sub-labels (public suffix
    stripped) are classified with the persisted "ac" model, and every
    domain with at least one flagged label is saved and checked.
    """
    timestring = "{}{:0>2d}{:0>2d}".format(year, month, day)
    suspicious_domains_set = set()
    domains_file = "../result_data/{}domains.txt".format(timestring)
    if os.path.exists(domains_file):
        # Reuse the cached result for this day.
        with open(domains_file, "r") as f:
            suspicious_domains_set.update(line.strip() for line in f)
        check_active_domains(suspicious_domains_set, timestring)
        return

    # Gather every queried domain from the day's hourly logs.
    init_domain_set = set()
    for hour in range(24):
        file_path = "{}{:0>2d}{:0>2d}{:0>2d}".format(year, month, day, hour)
        hourly_path = "../result_data/{}".format(file_path)
        if not os.path.exists(hourly_path):
            continue
        with open(hourly_path, "r") as f:
            for r in f:
                init_domain_set.add(r.strip().split(",")[1])

    psl = PublicSuffixList()
    domain_labels = []
    labels_labels = []
    domains_list = list(init_domain_set)
    # Split each domain (public suffix removed) into labels, remembering
    # which domain each label came from.
    for domain_idx, d in enumerate(domains_list):
        prefix = d[:d.index(psl.publicsuffix(d)) - 1]
        for label in prefix.split("."):
            if label:
                domain_labels.append(label)
                labels_labels.append(domain_idx)

    # Features are cached per day to avoid re-extraction.
    features_path = "../result_data/{}_features.npy".format(timestring)
    if os.path.exists(features_path):
        features = np.load(features_path)
    else:
        features = extract_all_features(domain_labels)
        np.save(features_path, features)

    # A domain is suspicious if any of its labels is classified as 1.
    clf = joblib.load("../result_data/ac_model.m")
    pred_labels = clf.predict(features)
    domain_index = {labels_labels[j]
                    for j in range(len(labels_labels)) if pred_labels[j] == 1}

    for idx in domain_index:
        ps = psl.privatesuffix(domains_list[idx])
        if ps is not None:
            suspicious_domains_set.add(ps)

    print("{} domains".format(len(suspicious_domains_set)))

    with open(domains_file, "w") as f:
        f.write("\n".join(suspicious_domains_set))
    print("save finish")
    # dgarchive check
    check_active_domains(suspicious_domains_set, timestring)
                    root_dir, t)))
            self.save_model(features, labels, type=t, parameters=p)

    def test_model(self, test_data, real_labels, type):
        """Evaluate the persisted `type` model on held-out data and print metrics.

        Loads ../result_data/{type}_model.m, predicts on test_data, and
        prints accuracy, recall, precision and F1 against real_labels.
        """
        print("{} model test result:".format(type))
        clf = joblib.load("../result_data/{}_model.m".format(type))
        pred_labels = clf.predict(test_data)
        print(pred_labels)
        # BUGFIX: sklearn metrics take (y_true, y_pred); the arguments were
        # reversed, which silently swaps precision and recall.
        print("accuracy:{}\nrecall:{}\nprecision:{}\nf1-score:{}" \
             .format(accuracy_score(real_labels, pred_labels),
                     recall_score(real_labels, pred_labels),
                     precision_score(real_labels, pred_labels),
                     f1_score(real_labels, pred_labels)))

    def test(self):
        """Run test_model on the "ac" and "nx" prediction datasets."""
        root_dir = "../data_sets/"
        for t in ["ac", "nx"]:
            csv_path = os.path.abspath(
                "{}{}_pred_data.csv".format(root_dir, t))
            npy_path = os.path.abspath(
                "{}{}_pred_data_features.npy".format(root_dir, t))
            features, labels = self.get_data(csv_path, npy_path)
            self.test_model(features, labels, t)


if __name__ == "__main__":
    extractor = ModelExtractor()
    # Smoke-test the persisted "ac" model on three hand-picked labels;
    # the expected classes are 0 ("www"), 1 ("xxfeee0d8"), 0 ("validttu").
    sample_features = extract_all_features(["www", "xxfeee0d8", "validttu"])
    extractor.test_model(sample_features, [0, 1, 0], "ac")
# Example #9
 def getDomainFeatures(self, domains):
     """Return one feature row per domain, merged from its per-label features."""
     labels, label_index = self.getAllDomainLabels(domains)
     per_label = char_feature.extract_all_features(labels)
     return self.unionFeature(per_label, label_index)