def load_new_data(clf='bin', cols=None):
    '''
    Load the resampled data sets.
    :param clf: load the binary ('bin') or multi-class ('mul') data
    :param cols: keep only these family columns as training data
    :return: the split data sets X_t, X_v, y_t, y_v
    '''
    if clf == 'bin':
        bin_data = pd.read_csv('/data1/lxf/DGA/data/bin_78_10w.csv')
        return train_data_split(bin_data, label_name='type')
    elif clf == 'mul':
        mul_data = pd.read_csv('/data1/lxf/DGA/data/mul_78_10w.csv')
        if cols is not None:
            new_id = list(range(len(cols)))
            new_dict = dict(zip(cols, new_id))
            # print the label -> family-name mapping
            print(new_dict)
            mul_data = mul_data[mul_data['family'].isin(cols)]
            mul_data['family'] = mul_data['family'].apply(lambda x: new_dict[x])
        return train_data_split(mul_data, label_name='family')
    else:
        mul_test = pd.read_csv('/data1/lxf/DGA/data/mul_test.csv')
        label = mul_test['family']
        # drop() with inplace=True returns None, so keep the returned copy instead
        features = mul_test.drop(['family'], axis=1)
        return features, label

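# train_data_split() is called by every loader in this file but is not defined
# in this section. A minimal sketch of what it is assumed to do, built on
# sklearn's train_test_split; the label_name and _size defaults are guesses
# inferred from the call sites in this file:
from sklearn.model_selection import train_test_split

def train_data_split(data, label_name='label', _size=0.2):
    # separate the label column from the feature columns
    y = data[label_name]
    X = data.drop([label_name], axis=1)
    # split into training and validation parts: X_t, X_v, y_t, y_v
    return train_test_split(X, y, test_size=_size, random_state=2020)
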
def lgb_bin_gridsearch():
    # Load Zou's resampled binary data set (with id and label columns) and
    # split it into a training part and a held-out test part.
    # Note: despite the lgb_ prefix, this function tunes an XGBoost classifier.
    train_set, test_set = load_bin_data(topn=400000)
    X_t, X_v, y_t, y_v = train_data_split(train_set)
    n_es = xgb_bin_model(X_t, X_v, y_t, y_v)
    param_test1 = {
        'max_depth': range(3, 8, 2),
        'min_child_weight': range(2, 6, 2)
    }
    gsearch1 = GridSearchCV(
        estimator=XGBClassifier(learning_rate=0.1, n_estimators=n_es,
                                max_depth=6, min_child_weight=1, gamma=0,
                                subsample=0.8, colsample_bytree=0.8,
                                objective='binary:logistic', nthread=-1,
                                scale_pos_weight=1, seed=2020),
        param_grid=param_test1, scoring='roc_auc', n_jobs=-1, cv=10, verbose=2)
    gsearch1.fit(X_t, y_t)
    print('Training sample count: %s' % train_set.shape[0])
    print('n_estimators = %s\nmax_depth = %s\nmin_child_weight = %s\nbest_score_ = %s'
          % (n_es, gsearch1.best_params_['max_depth'],
             gsearch1.best_params_['min_child_weight'], gsearch1.best_score_))

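# xgb_bin_model() is not defined in this section. From its use above it is
# assumed to fit a first-pass XGBoost model with early stopping and return the
# resulting tree count, which the grid search then fixes as n_estimators.
# A minimal sketch under that assumption (the parameters are illustrative):
def xgb_bin_model(X_t, X_v, y_t, y_v):
    model = XGBClassifier(learning_rate=0.1, n_estimators=2000,
                          objective='binary:logistic', seed=2020)
    # stop adding trees once validation AUC stops improving
    model.fit(X_t, y_t, eval_set=[(X_v, y_v)], eval_metric='auc',
              early_stopping_rounds=50, verbose=False)
    return model.best_iteration
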
def lstm(train):
    x_t, x_v, y_t, y_v = train_data_split(train)
    x_train = np.reshape(x_t.values, (x_t.shape[0], x_t.shape[1], 1))
    x_test = np.reshape(x_v.values, (x_v.shape[0], x_v.shape[1], 1))
    y_train = to_categorical(y_t.values, num_classes=2)
    y_test = to_categorical(y_v.values, num_classes=2)
    # params
    nb_lstm_outputs = 30  # number of LSTM units
    data_input = x_t.shape[1]  # one time step per feature column
    # build model
    model = Sequential()
    model.add(LSTM(units=nb_lstm_outputs, input_shape=(data_input, 1)))
    model.add(Dense(2, activation='softmax'))
    # compile: loss, optimizer, metrics
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    # train: epochs, batch_size
    model.fit(x_train, y_train, epochs=30, batch_size=128, verbose=1)
    model.summary()
    # evaluate() returns [loss, accuracy], not an F1 score
    score = model.evaluate(x_test, y_test, batch_size=128, verbose=1)
    print('===LSTM loss/accuracy: %s===' % score)
    return model

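# Example use of the returned model: since the output layer is a 2-way
# softmax, class predictions come from the argmax over the two output
# probabilities. x_new is a hypothetical frame with the same columns as x_t:
#   probs = model.predict(np.reshape(x_new.values,
#                                    (x_new.shape[0], x_new.shape[1], 1)))
#   preds = np.argmax(probs, axis=1)
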
def rf_model(train):
    # Default RandomForestClassifier parameters, for reference:
    '''
    n_estimators=100,             number of trees
    criterion="gini",             split criterion [entropy, gini]
    max_depth=None,               maximum tree depth
    min_samples_split=2,          minimum number of samples required to split a node
    min_samples_leaf=1,           minimum number of samples required at a leaf node
    min_weight_fraction_leaf=0.,  minimum weighted fraction of the total sample
                                  weight required at a leaf node; samples have
                                  equal weight when sample_weight is not provided
    max_features="auto",          number of features to consider when looking
                                  for the best split
    max_leaf_nodes=None, min_impurity_decrease=0., min_impurity_split=None,
    bootstrap=True, oob_score=False, n_jobs=None, random_state=None,
    verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0,
    max_samples=None
    '''
    rfc = RandomForestClassifier(random_state=2020, verbose=1, n_jobs=-1)
    x_t, x_v, y_t, y_v = train_data_split(train)
    rfc.fit(x_t, y_t)
    pre = rfc.predict(x_v)
    print('Random Forest Model F1 Score: %s' % _f1_score(pre, y_v))
    print(rfc.feature_importances_)
    return rfc

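# _f1_score() is not defined in this section; from the call sites (predictions
# first, ground truth second, binary labels) it is assumed to be a thin
# wrapper over sklearn's f1_score. A minimal sketch under that assumption:
from sklearn.metrics import f1_score

def _f1_score(pred, truth):
    # sklearn expects (y_true, y_pred), so swap the argument order
    return f1_score(truth, pred)
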
def load_mul_data():
    dga_mul = '/data0/new_workspace/mlxtend_dga_multi_20190316/merge/new_feature/dga_multi.csv'
    mul_data = pd.read_csv(dga_mul)
    cols = [
        'type', 'domain_len', '_contains_digits', '_subdomain_lengths_mean',
        '_n_grams0', '_n_grams1', '_n_grams4', '_hex_part_ratio',
        '_alphabet_size', '_shannon_entropy', '_consecutive_consonant_ratio',
        'domain_seq35', 'domain_seq36', 'domain_seq38', 'domain_seq39',
        'domain_seq40', 'domain_seq41', 'domain_seq42', 'domain_seq43',
        'domain_seq46', 'domain_seq47', 'domain_seq48', 'domain_seq49',
        'domain_seq50', 'domain_seq51', 'domain_seq52', 'domain_seq53',
        'domain_seq54', 'domain_seq55', 'domain_seq56', 'domain_seq57',
        'domain_seq58', 'domain_seq59', 'domain_seq60', 'domain_seq61',
        'domain_seq62', 'domain_seq63', 'domain_seq64', 'domain_seq65',
        'domain_seq66', 'domain_seq67', 'domain_seq68', 'domain_seq69',
        'domain_seq70', 'domain_seq71', 'domain_seq72', 'domain_seq73',
        'domain_seq74', 'domain_seq75'
    ]
    mul_data.columns = cols
    mul_data['id'] = mul_data.index
    # shift the 1-based type codes to 0-based labels
    mul_data['label'] = mul_data['type'].astype('int') - 1
    mul_data.drop(['type'], axis=1, inplace=True)
    X_t, X_v, y_t, y_v = train_data_split(mul_data, _size=0.9)
    X_t['label'] = y_t
    X_v['label'] = y_v
    X_t['id'] = X_t.index
    X_v['id'] = X_v.index
    return X_t, X_v

def load_bin_data(topn=None, return_all=False):
    dga_bin = '/data0/new_workspace/mlxtend_dga_bin_20190307/merge/new_feature/dga_bin.csv'
    legit = '/data0/new_workspace/mlxtend_dga_bin_20190307/merge/new_feature/legit.csv'
    black = pd.read_csv(dga_bin)
    white = pd.read_csv(legit)
    print('black shape %s, %s' % (black.shape[0], black.shape[1]))
    print('white shape %s, %s' % (white.shape[0], white.shape[1]))
    # merge the black (DGA) and white (legit) samples
    if topn is None:
        topn_b, topn_w = black.shape[0], white.shape[0]
    else:
        topn_b, topn_w = topn, topn
    black['label'] = [1] * black.shape[0]
    white['label'] = [0] * white.shape[0]
    black['id'] = black.index
    white['id'] = white.index
    data = pd.concat([black.head(topn_b), white.head(topn_w)])
    data.drop(['type'], axis=1, inplace=True)
    X_t, X_v, y_t, y_v = train_data_split(data, _size=0.2)
    X_t['label'] = y_t
    X_v['label'] = y_v
    X_t['id'] = X_t.index
    X_v['id'] = X_v.index
    if return_all:
        return data.reindex(np.random.permutation(data.index))  # shuffle the rows
    else:
        return X_t, X_v

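# Example usage of the two return modes of load_bin_data(): by default it
# returns the split train/validation frames (each carrying label and id
# columns again), while return_all=True returns one shuffled frame:
#   train_set, test_set = load_bin_data(topn=400000)
#   all_data = load_bin_data(return_all=True)
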
def load_zrz_data(clf='bin', min_features=False, int_type=False):
    '''
    :param clf: 'bin' or 'mul'
    :param min_features: whether to reduce the feature set; the list of
                         columns to drop (c_20000) is a global variable
    :param int_type: whether to cast the training data to int
    :return: the split training and validation sets
    '''
    if clf == 'mul':
        zrz_mul = '/data0/new_workspace/mlxtend_dga_multi_20190316/merge/new_feature/dga_multi.csv'
        mul_data = pd.read_csv(zrz_mul)
        cols = [
            'type', 'domain_len', '_contains_digits', '_subdomain_lengths_mean',
            '_n_grams0', '_n_grams1', '_n_grams4', '_hex_part_ratio',
            '_alphabet_size', '_shannon_entropy', '_consecutive_consonant_ratio',
            'domain_seq35', 'domain_seq36', 'domain_seq38', 'domain_seq39',
            'domain_seq40', 'domain_seq41', 'domain_seq42', 'domain_seq43',
            'domain_seq46', 'domain_seq47', 'domain_seq48', 'domain_seq49',
            'domain_seq50', 'domain_seq51', 'domain_seq52', 'domain_seq53',
            'domain_seq54', 'domain_seq55', 'domain_seq56', 'domain_seq57',
            'domain_seq58', 'domain_seq59', 'domain_seq60', 'domain_seq61',
            'domain_seq62', 'domain_seq63', 'domain_seq64', 'domain_seq65',
            'domain_seq66', 'domain_seq67', 'domain_seq68', 'domain_seq69',
            'domain_seq70', 'domain_seq71', 'domain_seq72', 'domain_seq73',
            'domain_seq74', 'domain_seq75'
        ]
        mul_data.columns = cols
        mul_data['type'] = mul_data['type'].astype('int') - 1
        if min_features:
            mul_data.drop(c_20000, axis=1, inplace=True)
        if int_type:
            # astype() returns a new frame, so keep the result
            mul_data = mul_data.astype('int')
        print('Data set loaded: %s rows, %s columns' % (mul_data.shape[0], mul_data.shape[1]))
        return train_data_split(mul_data, label_name='type')
    else:
        dga_bin = '/data0/new_workspace/mlxtend_dga_bin_20190307/merge/new_feature/dga_bin.csv'
        legit = '/data0/new_workspace/mlxtend_dga_bin_20190307/merge/new_feature/legit.csv'
        black = pd.read_csv(dga_bin)
        white = pd.read_csv(legit)
        black['label'] = [1] * black.shape[0]
        white['label'] = [0] * white.shape[0]
        data = pd.concat([black, white])
        data.drop(['type'], axis=1, inplace=True)
        print('Data set loaded')
        return train_data_split(data)

def cnn_1d_(train):
    import numpy as np
    import keras
    seed = 2020
    x_t, x_v, y_t, y_v = train_data_split(train)
    train = x_t.values.reshape(x_t.shape[0], x_t.shape[1], 1)
    label = y_t.values
    test = x_v.values.reshape(x_v.shape[0], x_v.shape[1], 1)
    test_label = y_v.values
    # convolution layer
    filters = 1      # number of convolution kernels
    kernel_size = 5  # kernel size
    convolution_1d_layer = keras.layers.convolutional.Conv1D(
        filters, kernel_size, strides=1, padding='same',
        input_shape=(x_t.shape[1], 1), activation='relu',
        name='convolution_1d_layer')
    # max-pooling layer
    max_pooling_layer = keras.layers.MaxPool1D(
        pool_size=5, strides=1, padding='valid', name='max_pooling_layer')
    # flatten layer, to adapt the dimensions for the dense layer
    reshape_layer = keras.layers.core.Flatten(name='reshape_layer')
    # fully connected output layer
    full_connect_layer = keras.layers.Dense(
        1,
        kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.1, seed=seed),
        bias_initializer='random_normal', use_bias=True, activation='sigmoid',
        name='full_connect_layer')
    # assemble the model
    model = keras.Sequential()
    model.add(convolution_1d_layer)
    model.add(max_pooling_layer)
    model.add(reshape_layer)
    model.add(full_connect_layer)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit
    model.fit(train, label, epochs=50, batch_size=100, validation_split=0.2)
    # print the network structure
    print(model.summary())
    # score on the validation set
    res = model.predict(test)
    _res = [round(x[0]) for x in res.tolist()]
    _score = _f1_score(_res, test_label)
    print('===F1 score: %s===' % _score)
    return model

def __init__(self, train_data):
    self.rounds = 2000
    self.early_stop = 10
    self.X_t, self.X_v, self.y_t, self.y_v = train_data_split(train_data)
    self.modelname = None
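
# The __init__ above belongs to a trainer class whose name is not shown in
# this section; it expects a DataFrame carrying a label column and pre-splits
# it once for all models. Assumed usage (class name hypothetical):
#   trainer = ModelTrainer(load_bin_data(return_all=True))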