def evaluation(sess, model, ratio):
    (sub_ano, sub_ano_label), _ = _split_dataset(ano, ano_label, mapping_ratio[ratio])
    x = np.concatenate((norm, sub_ano), axis=0)
    y = np.concatenate((norm_label, sub_ano_label), axis=0)

    ano_scores = []
    for _, batch_data in DataInput(x, test_batch_size):
        _ano_score = model.eval(sess, batch_data)
        # Extend
        ano_scores += list(_ano_score)
    ano_scores = np.array(ano_scores).reshape((-1, 1))

    # Calculate auc
    auroc = calc_auroc(y, ano_scores)
    print('Anomaly ratio:{:.4f}\tEval_auroc:{:.4f}'.format(ratio, auroc))
    prec, rec, f1 = calc_metric(y, ano_scores)
    print('Prec:{:.4f}\tRec:{:.4f}\tF1:{:.4f}\n'.format(prec, rec, f1))
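# --- hedged sketch (not part of the original file) ---------------------------
# calc_auroc and calc_metric are called above but not defined in this section.
# The helpers below are one plausible implementation, assuming they wrap the
# standard scikit-learn metrics and that higher anomaly scores mean "more
# anomalous"; the percentile-based threshold in calc_metric is an assumption.
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score


def calc_auroc(y_true, scores):
    # Area under the ROC curve computed directly from the raw anomaly scores.
    return roc_auc_score(y_true, scores)


def calc_metric(y_true, scores, percentile=80):
    # Binarize the scores at a fixed percentile, then report precision,
    # recall and F1 for the positive (anomalous) class.
    threshold = np.percentile(scores, percentile)
    y_pred = (scores >= threshold).astype(np.int32).ravel()
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary')
    return prec, rec, f1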
mode = 'talkingdata'
test_batch_size = 1024
method = 'fm'  # 'fm' or 'cross-e'
weight = 0.9
degree = 1
logdir = create_logdir(mode, method, weight, degree)
save_path = os.path.join(base_dir, logdir)

ano_size = 228423

with open('{}_dataset.pkl'.format(mode), 'rb') as f:
    train_set = pickle.load(f)
    test_set = pickle.load(f)
    count_list = pickle.load(f)

x_test, y_test = test_set
(x, y), _ = _split_dataset(x_test, y_test, percentage=0.2)
print('test set:', x.shape)


def evaluation(sess, model):
    ano_scores = []
    for _, batch_data in DataInput(x, test_batch_size):
        _ano_score = model.eval(sess, batch_data)
        # Extend
        ano_scores += list(_ano_score)
    ano_scores = np.array(ano_scores).reshape((-1, 1))

    # Calculate auc
    auroc = calc_auroc(y, ano_scores)
    print('Eval_auroc:{:.4f}'.format(auroc))
    prec, rec, f1 = calc_metric(y, ano_scores)
    print('Prec:{:.4f}\tRec:{:.4f}\tF1:{:.4f}\n'.format(prec, rec, f1))
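# --- hedged sketch (not part of the original file) ---------------------------
# DataInput is consumed above as "for _, batch_data in DataInput(x, batch_size)"
# but its definition is not in this section. A minimal batching iterator that
# matches that usage, yielding a step index and a contiguous slice of the data,
# could look like this:
class DataInput:
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
        # Ceil-division so the last, possibly smaller, batch is still yielded.
        self.epoch_size = (len(data) + batch_size - 1) // batch_size

    def __iter__(self):
        for step in range(self.epoch_size):
            batch = self.data[step * self.batch_size:(step + 1) * self.batch_size]
            yield step, batch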
if not os.path.exists('{}_dataset.pkl'.format(mode)):
    build_dataset(mode)

# prepare data
with open('{}_dataset.pkl'.format(mode), 'rb') as f:
    train_set = pickle.load(f)
    test_set = pickle.load(f)
    count_list = pickle.load(f)

x_train, y_train = train_set
x_test, y_test = test_set
x, y = x_train[:-ano_size], y_train[:-ano_size]  # for training
x_train_ano, y_train_ano = x_train[-ano_size:], y_train[-ano_size:]
(x_val, y_val), _ = _split_dataset(x_test, y_test, percentage=0.1)
x, y = x[:size], y[:size]
# contaminated data
# x = np.concatenate((x, x_train_ano[:10000]), axis=0)
# y = np.concatenate((y, y_train_ano[:10000]), axis=0)
print('training set:', x.shape)
print('validation set:', x_val.shape)
print(np.sum(y))
del x_train, y_train, x_test, y_test, _
gc.collect()


def _eval(sess, model, test_data, label):
    ano_scores = []
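# --- hedged sketch (not part of the original file) ---------------------------
# _split_dataset is not defined in this section. From its call sites,
# _split_dataset(x, y, percentage=p) returns ((x_sub, y_sub), (x_rest, y_rest)),
# where the first pair holds roughly p * len(x) shuffled samples and the second
# pair holds the remainder; the fixed seed below is an assumption.
import numpy as np


def _split_dataset(x, y, percentage=0.1, seed=42):
    rng = np.random.RandomState(seed)
    indices = rng.permutation(len(x))
    split = int(len(x) * percentage)
    first, rest = indices[:split], indices[split:]
    return (x[first], y[first]), (x[rest], y[rest])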
def build_dataset(mode): col_names = [ "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label" ] df = pd.read_csv('data/kddcup.data_10_percent_corrected', header=None, names=col_names) # One-hot encoding text_l = [ 'protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login' ] for name in text_l: _encode_text_dummy(df, name) # Label mapping labels = df['label'].copy() f = lambda x: 1 if (x == 'normal.') else 0 df.loc[:, 'label'] = labels.map(f) df_train = df.sample(frac=0.5, random_state=42) df_test = df.loc[~df.index.isin(df_train.index)].copy() y_train = df_train['label'].values df_train.drop(['label'], axis=1, inplace=True) x_train = df_train.values.astype(np.float32) y_test = df_test['label'].values df_test.drop(['label'], axis=1, inplace=True) x_test = df_test.values.astype(np.float32) print('raw training set', x_train.shape) # Scaler scaler = MinMaxScaler() x_train = scaler.fit_transform(x_train) x_test = scaler.transform(x_test) # val split (x_val, y_val), _ = _split_dataset(x_test, y_test, percentage=0.2) print('validation set', x_val.shape) # Only use majority class to train x_train = x_train[y_train != 1] y_train = y_train[y_train != 1] print('training set', x_train.shape) print('test set', x_test.shape) train_set = (x_train, y_train) val_set = (x_val, y_val) test_set = (x_test, y_test) with open('{}_dataset.pkl'.format(mode), 'wb') as f: pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL) pickle.dump(val_set, f, pickle.HIGHEST_PROTOCOL) pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL)
def build_dataset(mode):
    col_names = _use_cols()
    df = pd.read_csv('data/sub_normals_1percent.csv',
                     usecols=col_names,
                     parse_dates=['click_time'])  # 685269 rows
    len_df = len(df)
    df_ano = pd.read_csv('data/anomalies.csv',
                         usecols=col_names,
                         parse_dates=['click_time'])  # 456846 rows
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
    df = pd.concat([df, df_ano])
    del df_ano
    gc.collect()

    # parsing date time
    print('==== parsing second, minute, hour, day, dayofweek ====')
    df['second'] = df['click_time'].dt.second.astype('uint8')
    df['minute'] = df['click_time'].dt.minute.astype('uint8')
    df['hour'] = df['click_time'].dt.hour.astype('uint8')
    df['day'] = df['click_time'].dt.day.astype('uint8')
    df['dayofweek'] = df['click_time'].dt.dayofweek.astype('uint8')
    print(df['is_attributed'].value_counts())

    label = df['is_attributed'].copy()
    df.drop(['click_time', 'is_attributed'], axis=1, inplace=True)

    # Label encoding
    df = df.apply(LabelEncoder().fit_transform)
    print(df.columns)

    x = df.values.astype(np.int32)
    y = label.values.astype(np.int32)
    count_list = np.max(x, axis=0) + 1
    print('sub dataset', x.shape)

    df_nor, nor_label = x[:len_df], y[:len_df]
    df_ano, ano_label = x[len_df:], y[len_df:]

    # train and test split, use only normal class to train
    (x_test, y_test), (x_train, y_train) = _split_dataset(
        df_nor, nor_label, percentage=0.5)  # 342635
    # sample half of the anomalies into the test set,
    # giving a test anomaly ratio of ~0.4
    (ano_test, ano_test_label), (ano_train, ano_train_label) = _split_dataset(
        df_ano, ano_label, percentage=0.5)  # 228423
    x_train = np.concatenate((x_train, ano_train), axis=0)
    y_train = np.concatenate((y_train, ano_train_label), axis=0)
    x_test = np.concatenate((x_test, ano_test), axis=0)
    y_test = np.concatenate((y_test, ano_test_label), axis=0)
    print('test set', x_test.shape)
    print('train set', x_train.shape)
    print(np.sum(y_train), np.sum(y_test))
    print(count_list)

    train_set = (x_train, y_train)
    test_set = (x_test, y_test)
    with open('{}_dataset.pkl'.format(mode), 'wb') as f:
        pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(count_list, f, pickle.HIGHEST_PROTOCOL)
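# --- hedged sketch (not part of the original file) ---------------------------
# _use_cols is not defined in this section. It supplies the column whitelist
# for pd.read_csv above; the exact list is an assumption based on the standard
# TalkingData click fields plus the 'click_time' and 'is_attributed' columns
# referenced in build_dataset.
def _use_cols():
    return ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']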