Example #1
def load_new_data(clf='bin', cols=None):
    '''
    Load the newly resampled data.
    :param clf: load the binary ('bin') or multi-class ('mul') data set
    :param cols: use only a subset of families as training data
    :return: the split data sets X_t, X_v, y_t, y_v
    '''
    if clf == 'bin':
        bin_data = pd.read_csv('/data1/lxf/DGA/data/bin_78_10w.csv')
        return train_data_split(bin_data, label_name='type')
    elif clf == 'mul':
        if cols is not None:
            # Map the selected family names to consecutive integer labels
            new_dict = dict(zip(cols, range(len(cols))))
            # Print the mapping between labels and family names
            print(new_dict)
            mul_data = pd.read_csv('/data1/lxf/DGA/data/mul_78_10w.csv')
            mul_data = mul_data[mul_data['family'].isin(cols)]
            mul_data['family'] = mul_data['family'].apply(
                lambda x: new_dict[x])
        else:
            mul_data = pd.read_csv('/data1/lxf/DGA/data/mul_78_10w.csv')
        return train_data_split(mul_data, label_name='family')
    else:
        mul_test = pd.read_csv('/data1/lxf/DGA/data/mul_test.csv')
        label = mul_test['family']
        features = mul_test.drop(['family'], axis=1)  # drop(inplace=True) would return None
        return features, label
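A minimal usage sketch for load_new_data; the family names below are placeholders, not names confirmed to exist in mul_78_10w.csv:

families = ['family_a', 'family_b', 'family_c']  # hypothetical family names
X_t, X_v, y_t, y_v = load_new_data(clf='mul', cols=families)
# labels are remapped to 0..len(families)-1 in the order listed above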
Example #2
def lgb_bin_gridsearch():
    # Use the training data sampled by Zou: load the binary data set (with id and label columns) and split it into training and test sets
    train_set, test_set = load_bin_data(topn=400000)

    X_t, X_v, y_t, y_v = train_data_split(train_set)
    n_es = xgb_bin_model(X_t, X_v, y_t, y_v)

    param_test1 = {
        'max_depth': range(3, 8, 2),
        'min_child_weight': range(2, 6, 2)
    }
    gsearch1 = GridSearchCV(estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=n_es,
        max_depth=6,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=-1,
        scale_pos_weight=1,
        seed=2020),
                            param_grid=param_test1,
                            scoring='roc_auc',
                            n_jobs=-1,
                            cv=10,
                            verbose=2)
    gsearch1.fit(X_t, y_t)
    print('Number of training samples: %s' % train_set.shape[0])
    print(
        'n_estimators = %s\nmax_depth = %s\nmin_child_weight = %s\nbest_score_ = %s'
        % (n_es, gsearch1.best_params_['max_depth'],
           gsearch1.best_params_['min_child_weight'], gsearch1.best_score_))
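A hedged sketch of the customary next tuning stage: with max_depth and min_child_weight fixed from gsearch1, search gamma using the same scaffolding (n_es, gsearch1, X_t, y_t refer to the names inside the function above):

param_test2 = {'gamma': [i / 10.0 for i in range(0, 5)]}
gsearch2 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=n_es,
    max_depth=gsearch1.best_params_['max_depth'],
    min_child_weight=gsearch1.best_params_['min_child_weight'],
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    seed=2020),
                        param_grid=param_test2,
                        scoring='roc_auc',
                        n_jobs=-1,
                        cv=10,
                        verbose=2)
gsearch2.fit(X_t, y_t)
print(gsearch2.best_params_, gsearch2.best_score_)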
def lstm(train):
    x_t, x_v, y_t, y_v = train_data_split(train)
    x_train = np.reshape(x_t.values, (x_t.shape[0], x_t.shape[1], 1))
    x_test = np.reshape(x_v.values, (x_v.shape[0], x_v.shape[1], 1))
    y_train = to_categorical(y_t.values, num_classes=2)
    y_test = to_categorical(y_v.values, num_classes=2)

    # params
    nb_lstm_outputs = 30  # number of LSTM units
    data_input = x_t.shape[1]

    # build model
    model = Sequential()
    model.add(LSTM(units=nb_lstm_outputs, input_shape=(data_input, 1)))
    model.add(Dense(2, activation='softmax'))

    # compile:loss, optimizer, metrics
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # train: epochs, batch_size
    model.fit(x_train, y_train, epochs=30, batch_size=128, verbose=1)

    model.summary()

    score = model.evaluate(x_test, y_test, batch_size=128, verbose=1)
    print('===LSTM loss/accuracy: %s===' % score)  # evaluate returns [loss, accuracy], not F1

    return model
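The function name promises an F1 score, but model.evaluate only returns loss and accuracy; a minimal sketch of an actual F1 computation, assuming x_test and y_test are the reshaped arrays built inside lstm:

from sklearn.metrics import f1_score

y_pred = model.predict(x_test).argmax(axis=1)  # softmax probabilities -> class index
y_true = y_test.argmax(axis=1)                 # undo the one-hot encoding
print('LSTM F1 score: %s' % f1_score(y_true, y_pred))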
def rf_model(train):
    # RandomForestClassifier parameters for reference
    '''
     n_estimators=100,            number of trees
     criterion="gini",            split criterion [entropy, gini]
     max_depth=None,              maximum depth of a tree
     min_samples_split=2,         minimum number of samples required to split a node
     min_samples_leaf=1,          minimum number of samples required at a leaf node
     min_weight_fraction_leaf=0., the minimum weighted fraction of the sum total of weights (of all
                                  the input samples) required to be at a leaf node; samples have
                                  equal weight when sample_weight is not provided
     max_features="auto",         number of features to consider when looking for the best split
     max_leaf_nodes=None,
     min_impurity_decrease=0.,
     min_impurity_split=None,
     bootstrap=True,
     oob_score=False,
     n_jobs=None,
     random_state=None,
     verbose=0,
     warm_start=False,
     class_weight=None,
     ccp_alpha=0.0,
     max_samples=None
    '''

    rfc = RandomForestClassifier(random_state=2020, verbose=1, n_jobs=-1)
    x_t, x_v, y_t, y_v = train_data_split(train)
    rfc.fit(x_t, y_t)
    pre = rfc.predict(x_v)
    print('Random Forest Model F1 Score: %s' % _f1_score(pre, y_v))
    print(rfc.feature_importances_)
    return rfc
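A small follow-up sketch: pairing feature_importances_ with the column names makes the raw array printed above readable (assumes x_t is a DataFrame, as train_data_split appears to return):

imp = pd.Series(rfc.feature_importances_, index=x_t.columns)
print(imp.sort_values(ascending=False).head(10))  # ten most important features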
def load_mul_data():
    dga_mul = '/data0/new_workspace/mlxtend_dga_multi_20190316/merge/new_feature/dga_multi.csv'
    mul_data = pd.read_csv(dga_mul)
    cols = [
        'type', 'domain_len', '_contains_digits', '_subdomain_lengths_mean',
        '_n_grams0', '_n_grams1', '_n_grams4', '_hex_part_ratio',
        '_alphabet_size', '_shannon_entropy', '_consecutive_consonant_ratio',
        'domain_seq35', 'domain_seq36', 'domain_seq38', 'domain_seq39',
        'domain_seq40', 'domain_seq41', 'domain_seq42', 'domain_seq43',
        'domain_seq46', 'domain_seq47', 'domain_seq48', 'domain_seq49',
        'domain_seq50', 'domain_seq51', 'domain_seq52', 'domain_seq53',
        'domain_seq54', 'domain_seq55', 'domain_seq56', 'domain_seq57',
        'domain_seq58', 'domain_seq59', 'domain_seq60', 'domain_seq61',
        'domain_seq62', 'domain_seq63', 'domain_seq64', 'domain_seq65',
        'domain_seq66', 'domain_seq67', 'domain_seq68', 'domain_seq69',
        'domain_seq70', 'domain_seq71', 'domain_seq72', 'domain_seq73',
        'domain_seq74', 'domain_seq75'
    ]
    mul_data.columns = cols
    mul_data['id'] = mul_data.index
    mul_data['label'] = mul_data['type'].astype('int') - 1
    mul_data.drop(['type'], axis=1, inplace=True)
    X_t, X_v, y_t, y_v = train_data_split(mul_data, _size=0.9)
    X_t['label'] = y_t
    X_v['label'] = y_v
    X_t['id'] = X_t.index
    X_v['id'] = X_v.index
    return X_t, X_v
def load_bin_data(topn=None, return_all=False):
    dga_bin = '/data0/new_workspace/mlxtend_dga_bin_20190307/merge/new_feature/dga_bin.csv'
    legit = '/data0/new_workspace/mlxtend_dga_bin_20190307/merge/new_feature/legit.csv'
    black = pd.read_csv(dga_bin)
    white = pd.read_csv(legit)
    print('black shape %s, %s' % (black.shape[0], black.shape[1]))
    print('white shape %s, %s' % (white.shape[0], white.shape[1]))

    # Merge black (DGA) and white (legit) samples
    if topn is None:
        topn_b, topn_w = black.shape[0], white.shape[0]
    else:
        topn_b, topn_w = topn, topn

    black['label'] = [1] * black.shape[0]
    white['label'] = [0] * white.shape[0]
    black['id'] = black.index
    white['id'] = white.index
    data = pd.concat([black.head(topn_b), white.head(topn_w)])
    data.drop(['type'], axis=1, inplace=True)

    X_t, X_v, y_t, y_v = train_data_split(data, _size=0.2)
    X_t['label'] = y_t
    X_v['label'] = y_v
    X_t['id'] = X_t.index
    X_v['id'] = X_v.index

    if return_all:
        return data.reindex(np.random.permutation(data.index))  # shuffle the row order
    else:
        return X_t, X_v
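A minimal usage sketch covering both return modes of load_bin_data:

train_set, test_set = load_bin_data(topn=400000)  # two frames, as used in lgb_bin_gridsearch
all_data = load_bin_data(return_all=True)         # one shuffled frame with every sample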
Example #7
def load_zrz_data(clf='bin', min_features=False, int_type=False):
    '''
    :param clf: 'bin' or 'mul'
    :param min_features: whether to drop features; the columns to drop come from the global c_20000 list
    :param int_type: whether to cast the training data to int
    :return: the split training and validation sets
    '''
    if clf == 'mul':
        zrz_mul = '/data0/new_workspace/mlxtend_dga_multi_20190316/merge/new_feature/dga_multi.csv'
        mul_data = pd.read_csv(zrz_mul)
        cols = [
            'type', 'domain_len', '_contains_digits',
            '_subdomain_lengths_mean', '_n_grams0', '_n_grams1', '_n_grams4',
            '_hex_part_ratio', '_alphabet_size', '_shannon_entropy',
            '_consecutive_consonant_ratio', 'domain_seq35', 'domain_seq36',
            'domain_seq38', 'domain_seq39', 'domain_seq40', 'domain_seq41',
            'domain_seq42', 'domain_seq43', 'domain_seq46', 'domain_seq47',
            'domain_seq48', 'domain_seq49', 'domain_seq50', 'domain_seq51',
            'domain_seq52', 'domain_seq53', 'domain_seq54', 'domain_seq55',
            'domain_seq56', 'domain_seq57', 'domain_seq58', 'domain_seq59',
            'domain_seq60', 'domain_seq61', 'domain_seq62', 'domain_seq63',
            'domain_seq64', 'domain_seq65', 'domain_seq66', 'domain_seq67',
            'domain_seq68', 'domain_seq69', 'domain_seq70', 'domain_seq71',
            'domain_seq72', 'domain_seq73', 'domain_seq74', 'domain_seq75'
        ]
        mul_data.columns = cols
        mul_data['type'] = mul_data['type'].astype('int') - 1
        if min_features:
            mul_data.drop(c_20000, axis=1, inplace=True)
        if int_type:
            mul_data = mul_data.astype('int')  # astype returns a copy, so assign it back
        print('Data set loaded: %s rows, %s columns' %
              (mul_data.shape[0], mul_data.shape[1]))
        return train_data_split(mul_data, label_name='type')
    else:
        dga_bin = '/data0/new_workspace/mlxtend_dga_bin_20190307/merge/new_feature/dga_bin.csv'
        legit = '/data0/new_workspace/mlxtend_dga_bin_20190307/merge/new_feature/legit.csv'
        black = pd.read_csv(dga_bin)
        white = pd.read_csv(legit)
        black['label'] = [1] * black.shape[0]
        white['label'] = [0] * white.shape[0]
        data = pd.concat([black, white])
        data.drop(['type'], axis=1, inplace=True)
        print('Data set loaded')
        return train_data_split(data)
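A one-line usage sketch for load_zrz_data; the min_features path assumes the global c_20000 list of columns to drop is defined:

X_t, X_v, y_t, y_v = load_zrz_data(clf='mul', min_features=True, int_type=True)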
Example #8
def cnn_1d_(train):
    import numpy as np
    import keras

    seed = 2020
    x_t, x_v, y_t, y_v = train_data_split(train)
    train = x_t.values.reshape(x_t.shape[0], x_t.shape[1], 1)
    label = y_t.values

    test = x_v.values.reshape(x_v.shape[0], x_v.shape[1], 1)
    test_label = y_v.values

    # Convolutional layer
    filters = 1  # number of convolution kernels
    kernel_size = 5  # kernel size
    convolution_1d_layer = keras.layers.Conv1D(filters, kernel_size, strides=1, padding='same',
                                               input_shape=(x_t.shape[1], 1), activation="relu",
                                               name="convolution_1d_layer")
    # Max-pooling layer
    max_pooling_layer = keras.layers.MaxPool1D(pool_size=5, strides=1, padding="valid", name="max_pooling_layer")

    # Flatten layer: reshape the output to fit the fully connected layer
    reshape_layer = keras.layers.Flatten(name="reshape_layer")

    # Fully connected layer
    full_connect_layer = keras.layers.Dense(1, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.1,
                                                                                                  seed=seed),
                                            bias_initializer="random_normal", use_bias=True, activation='sigmoid',
                                            name="full_connect_layer")

    # Build the model
    model = keras.Sequential()
    model.add(convolution_1d_layer)
    model.add(max_pooling_layer)
    model.add(reshape_layer)
    model.add(full_connect_layer)

    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # fit
    model.fit(train, label, epochs=50, batch_size=100, validation_split=0.2)

    # Print the network structure
    print(model.summary())

    # Performance on the validation set
    res = model.predict(test)
    _res = [round(x[0]) for x in res.tolist()]
    _score = _f1_score(_res, test_label)
    print('===F1 score: %s===' % _score)

    return model
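A hedged usage sketch: train on a frame from the loaders above and persist the result (the file name is a placeholder):

train_set, test_set = load_bin_data(topn=400000)
model = cnn_1d_(train_set)
model.save('cnn_1d_dga.h5')  # placeholder file name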
Example #9
def __init__(self, train_data):
    self.rounds = 2000
    self.early_stop = 10
    self.X_t, self.X_v, self.y_t, self.y_v = train_data_split(train_data)
    self.modelname = None