Beispiel #1
0
def evaluation(sess, model, ratio):
    """Score normal data plus a subset of the anomalies and print metrics.

    A subset of the module-level anomaly set is drawn with
    ``_split_dataset`` using ``mapping_ratio[ratio]``, stacked after the
    module-level normal samples, scored batch by batch with ``model.eval``
    and summarised as AUROC, precision, recall and F1 on stdout.

    Args:
        sess: Session object forwarded unchanged to ``model.eval``.
        model: Object exposing ``eval(sess, batch_data)`` that returns an
            iterable of anomaly scores for the batch.
        ratio: Key into ``mapping_ratio``; also printed as the anomaly ratio.
    """
    anomaly_subset, _ = _split_dataset(ano, ano_label, mapping_ratio[ratio])
    sub_ano, sub_ano_label = anomaly_subset

    # Normal samples come first, the selected anomalies are appended after.
    eval_x = np.concatenate((norm, sub_ano), axis=0)
    eval_y = np.concatenate((norm_label, sub_ano_label), axis=0)

    collected = []
    for _, batch_data in DataInput(eval_x, test_batch_size):
        collected.extend(model.eval(sess, batch_data))
    # One score per row, as a column vector.
    score_column = np.array(collected).reshape(-1, 1)

    auroc = calc_auroc(eval_y, score_column)
    print('Anomaly ratio:{:.4f}\tEval_auroc:{:.4f}'.format(ratio, auroc))

    prec, rec, f1 = calc_metric(eval_y, score_column)
    print('Prec:{:.4f}\tRec:{:.4f}\tF1:{:.4f}\n'.format(prec, rec, f1))
Beispiel #2
0
# ---- Experiment configuration -------------------------------------------
# `mode` doubles as the prefix of the pickled dataset file ('<mode>_dataset.pkl').
mode = 'talkingdata'
# Batch size handed to DataInput when scoring the test data.
test_batch_size = 1024
method = 'fm'  # 'fm' or 'cross-e'
weight = 0.9
degree = 1
# create_logdir and base_dir are defined outside this excerpt.
logdir = create_logdir(mode, method, weight, degree)
save_path = os.path.join(base_dir, logdir)
# NOTE(review): presumably the number of anomaly rows in the training split;
# not used in the statements visible here - confirm against the dataset builder.
ano_size = 228423

# Three objects are read sequentially from the same pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('{}_dataset.pkl'.format(mode), 'rb') as f:
    train_set = pickle.load(f)
    test_set = pickle.load(f)
    count_list = pickle.load(f)

x_test, y_test = test_set
# Keep only the first part returned by _split_dataset (percentage=0.2) as the
# module-level evaluation data `x`, `y`; the second part is discarded.
(x, y), _ = _split_dataset(x_test, y_test, percentage=0.2)
print('test set:', x.shape)


def evaluation(sess, model):
    """Score the module-level test data ``x`` and print metrics against ``y``.

    Batches are produced by ``DataInput(x, test_batch_size)`` and scored with
    ``model.eval``; AUROC, precision, recall and F1 are written to stdout.

    Args:
        sess: Session object forwarded unchanged to ``model.eval``.
        model: Object exposing ``eval(sess, batch_data)`` that returns an
            iterable of anomaly scores for the batch.
    """
    collected = []
    for _, batch_data in DataInput(x, test_batch_size):
        collected.extend(model.eval(sess, batch_data))
    # One score per row, as a column vector.
    score_column = np.array(collected).reshape(-1, 1)

    auroc = calc_auroc(y, score_column)
    print('Eval_auroc:{:.4f}'.format(auroc))

    prec, rec, f1 = calc_metric(y, score_column)
    print('Prec:{:.4f}\tRec:{:.4f}\tF1:{:.4f}\n'.format(prec, rec, f1))
# Build the pickled dataset on first run only; later runs reuse the file.
if not os.path.exists('{}_dataset.pkl'.format(mode)):
    build_dataset(mode)

# prepare data
# Three objects are read sequentially from the same pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('{}_dataset.pkl'.format(mode), 'rb') as f:
    train_set = pickle.load(f)
    test_set = pickle.load(f)
    count_list = pickle.load(f)

x_train, y_train = train_set
x_test, y_test = test_set

# The last `ano_size` rows of the training split are separated from the rest.
# NOTE(review): presumably those tail rows are the anomalies - confirm against
# the dataset builder.
x, y = x_train[:-ano_size], y_train[:-ano_size]  # for training
x_train_ano, y_train_ano = x_train[-ano_size:], y_train[-ano_size:]
# Validation data is the first part returned by _split_dataset on the test split.
(x_val, y_val), _ = _split_dataset(x_test, y_test, percentage=0.1)

# Truncate the training data to `size` rows (`size` is defined outside this excerpt).
x, y = x[:size], y[:size]
# contaminated data
# x = np.concatenate((x, x_train_ano[:10000]), axis=0)
# y = np.concatenate((y, y_train_ano[:10000]), axis=0)
print('training set:', x.shape)
print('validation set:', x_val.shape)
# Sum of the training labels.
print(np.sum(y))

# Drop references to the full arrays (and the unused split remainder bound to
# `_`) so the collector can reclaim them.
del x_train, y_train, x_test, y_test, _
gc.collect()


def _eval(sess, model, test_data, label):
    # NOTE(review): only the initialisation of the score accumulator is
    # visible here; the rest of the body appears to be missing from this
    # excerpt. As written, the function returns None without using its
    # arguments - restore the full body before relying on it.
    ano_scores = []
def build_dataset(mode):
    """Build train/validation/test splits from the KDD Cup 10% file and pickle them.

    Steps, in order:
      1. Read ``data/kddcup.data_10_percent_corrected`` (no header row).
      2. Pass each categorical column to ``_encode_text_dummy`` (one-hot
         encoding, applied to the frame in place).
      3. Map the label column to 1 for ``'normal.'`` and 0 for anything else.
      4. Sample half of the rows (seed 42) for training; the rest is test.
      5. Min-max scale the features, fitting on the training half only.
      6. Take a validation subset from the test data via ``_split_dataset``;
         the test set itself is kept whole.
      7. Keep only training rows whose label is not 1.
      8. Dump ``(x_train, y_train)``, ``(x_val, y_val)`` and
         ``(x_test, y_test)``, in that order, to ``<mode>_dataset.pkl``.

    Args:
        mode: Prefix of the output pickle file name.
    """
    columns = [
        "duration", "protocol_type", "service", "flag",
        "src_bytes", "dst_bytes", "land", "wrong_fragment",
        "urgent", "hot", "num_failed_logins", "logged_in",
        "num_compromised", "root_shell", "su_attempted", "num_root",
        "num_file_creations", "num_shells", "num_access_files",
        "num_outbound_cmds", "is_host_login", "is_guest_login",
        "count", "srv_count", "serror_rate", "srv_serror_rate",
        "rerror_rate", "srv_rerror_rate", "same_srv_rate",
        "diff_srv_rate", "srv_diff_host_rate",
        "dst_host_count", "dst_host_srv_count",
        "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
        "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
        "dst_host_serror_rate", "dst_host_srv_serror_rate",
        "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
        "label",
    ]
    frame = pd.read_csv('data/kddcup.data_10_percent_corrected',
                        header=None,
                        names=columns)

    # One-hot encoding of the categorical columns (helper mutates `frame`).
    for categorical in ('protocol_type', 'service', 'flag', 'land',
                        'logged_in', 'is_host_login', 'is_guest_login'):
        _encode_text_dummy(frame, categorical)

    def _to_binary(raw_label):
        # 'normal.' becomes 1, every other label becomes 0.
        if raw_label == 'normal.':
            return 1
        return 0

    raw_labels = frame['label'].copy()
    frame.loc[:, 'label'] = raw_labels.map(_to_binary)

    # Fixed seed keeps the 50/50 split reproducible.
    train_frame = frame.sample(frac=0.5, random_state=42)
    in_train = frame.index.isin(train_frame.index)
    test_frame = frame.loc[~in_train].copy()

    y_train = train_frame['label'].values
    x_train = train_frame.drop(['label'], axis=1).values.astype(np.float32)

    y_test = test_frame['label'].values
    x_test = test_frame.drop(['label'], axis=1).values.astype(np.float32)

    print('raw training set', x_train.shape)

    # Scale with statistics from the training half only.
    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Validation data is a subset of the test data; the remainder is unused.
    validation_part, _ = _split_dataset(x_test, y_test, percentage=0.2)
    x_val, y_val = validation_part
    print('validation set', x_val.shape)

    # Only use majority class to train: drop rows labelled 1.
    keep = y_train != 1
    x_train, y_train = x_train[keep], y_train[keep]

    print('training set', x_train.shape)
    print('test set', x_test.shape)

    splits = ((x_train, y_train), (x_val, y_val), (x_test, y_test))
    with open('{}_dataset.pkl'.format(mode), 'wb') as out_file:
        for split in splits:
            pickle.dump(split, out_file, pickle.HIGHEST_PROTOCOL)
Beispiel #5
0
def build_dataset(mode):
    """Build train/test splits from the click CSV files and pickle them.

    Reads ``data/sub_normals_1percent.csv`` and ``data/anomalies.csv``,
    derives time-of-click columns, label-encodes every remaining column,
    splits the normal rows and the anomaly rows separately with
    ``_split_dataset(..., percentage=0.5)`` and writes three objects, in
    this order, to ``<mode>_dataset.pkl``:

      1. ``(x_train, y_train)`` - normal rows followed by anomaly rows,
      2. ``(x_test, y_test)``   - normal rows followed by anomaly rows,
      3. ``count_list``         - per-column ``max + 1`` of the encoded data.

    Args:
        mode: Prefix of the output pickle file name.
    """
    col_names = _use_cols()
    df = pd.read_csv('data/sub_normals_1percent.csv',
                     usecols=col_names,
                     parse_dates=['click_time'])  # 685269
    # Number of rows from the normals file; used later to split the stacked
    # array back into its two sources.
    n_normal = len(df)
    df_ano = pd.read_csv('data/anomalies.csv',
                         usecols=col_names,
                         parse_dates=['click_time'])  # 456846
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows in
    # the same order and, like append, keeps the original index labels.
    df = pd.concat([df, df_ano])
    del df_ano
    gc.collect()

    # parsing date time
    print('==== parsing second, minute, hour, day, dayofweek ====')
    click_time = df['click_time'].dt
    for part in ('second', 'minute', 'hour', 'day', 'dayofweek'):
        df[part] = getattr(click_time, part).astype('uint8')

    print(df['is_attributed'].value_counts())
    label = df['is_attributed'].copy()
    df.drop(['click_time', 'is_attributed'], axis=1, inplace=True)

    # Label encoding: every remaining column is re-fitted and encoded on its own.
    df = df.apply(LabelEncoder().fit_transform)
    print(df.columns)
    x = df.values.astype(np.int32)
    y = label.values.astype(np.int32)
    # Encoded values start at 0, so max + 1 is the number of codes per column.
    count_list = np.max(x, axis=0) + 1
    print('sub dataset', x.shape)

    # Rows keep file order: normals first, anomalies after.
    x_nor, y_nor = x[:n_normal], y[:n_normal]
    x_ano, y_ano = x[n_normal:], y[n_normal:]

    # Train/test split of the normal rows; the first part returned by
    # _split_dataset goes to the test set.
    (x_test, y_test), (x_train, y_train) = _split_dataset(
        x_nor, y_nor, percentage=0.5)  # 342635

    # sample out test set with specific anomaly ratio = 0.4
    (ano_test, ano_test_label), (ano_train, ano_train_label) = _split_dataset(
        x_ano, y_ano, percentage=0.5)  # 228423

    # Anomalies are appended after the normal rows in both splits, so they
    # occupy the tail of each array.
    x_train = np.concatenate((x_train, ano_train), axis=0)
    y_train = np.concatenate((y_train, ano_train_label), axis=0)

    x_test = np.concatenate((x_test, ano_test), axis=0)
    y_test = np.concatenate((y_test, ano_test_label), axis=0)

    print('test set', x_test.shape)
    print('train set', x_train.shape)
    print(np.sum(y_train), np.sum(y_test))
    print(count_list)

    train_set = (x_train, y_train)
    test_set = (x_test, y_test)

    with open('{}_dataset.pkl'.format(mode), 'wb') as f:
        for obj in (train_set, test_set, count_list):
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)