Example #1
0
def execute_run(DATA_SET):
    """Train a DaGMM model on ``DATA_SET`` and evaluate it.

    Fetches the (one-hot encoded) data, builds the encoder/decoder/GMM
    configurations, trains the model, and returns the test results.

    Returns:
        (mean_aupr, std): mean and standard deviation of the area under
        the precision-recall curve, as computed by ``test`` over the
        anomaly sets.
    """
    global LOGGER

    # Per-dataset (batch_size, anomaly_ratio).  The original if/elif chain
    # mostly re-assigned the defaults; a lookup table makes the actual
    # per-dataset differences visible at a glance.
    _HYPERPARAMS = {
        'kddcup': (1024, 0.2),
        'kddcup_neptune': (1024, 0.2),
        'nsl_kdd': (512, 0.1),
        'nb15': (512, 0.1),
        'gureKDD': (1024, 0.1),
    }
    batch_size, anomaly_ratio = _HYPERPARAMS.get(DATA_SET, (1024, 0.2))

    data_dict, _ = data_fetcher.get_data(
        DATA_SET,
        one_hot=True,
        num_anom_sets=5,
        anomaly_ratio=anomaly_ratio
    )

    train_df = data_dict['train']
    train_X = train_df.values

    # 4th element (loss_structure_config) is unused by DaGMM.
    encoder_structure_config, decoder_structure_config, gmm_structure_config, _, latent_dim = create_config(
        DATA_SET
    )
    pprint(encoder_structure_config)
    pprint(decoder_structure_config)

    dagmm_obj = DaGMM(
        DEVICE,
        encoder_structure_config,
        decoder_structure_config,
        n_gmm=gmm_structure_config['num_components'],
        ae_latent_dim=latent_dim
    )
    dagmm_obj = dagmm_obj.to(DEVICE)
    print(dagmm_obj)

    dagmm_obj = train(
        dagmm_obj,
        train_X,
        num_epochs=400,
        batch_size=batch_size,
        LR=0.0001
    )
    mean_aupr, std = test(
        dagmm_obj,
        data_dict
    )

    return mean_aupr, std
Example #2
0
def create_config(
        data_set
):
    """Build the network-structure configurations for ``data_set``.

    Reads the per-dataset architecture settings from
    ``architecture_config.yaml`` and combines them with the column
    metadata returned by ``data_fetcher.get_data``.

    Returns:
        (encoder_structure_config, decoder_structure_config,
         gmm_structure_config, loss_structure_config, latent_dim)
    """
    config_file = 'architecture_config.yaml'

    with open(config_file, 'r') as fh:
        config = yaml.safe_load(fh)

    data_dict, meta_data_df = data_fetcher.get_data(data_set, one_hot=True)

    # discrete_columns : { column_name : num_categories }
    discrete_column_dims = dict(
        zip(list(meta_data_df['column']), list(meta_data_df['dimension']))
    )

    # Binary columns (dim == 2) occupy a single 0/1 column after encoding;
    # all other categoricals are fully one-hot expanded.
    num_discrete_columns = sum(
        1 if dim == 2 else dim for dim in discrete_column_dims.values()
    )

    num_real_columns = len(data_dict['train'].columns) - num_discrete_columns
    print('Num real columns :: ', num_real_columns)
    print('Num discrete columns ::', num_discrete_columns)

    latent_dim = config[data_set]['ae_latent_dimension']

    # ======================================================
    # Encoder structure
    # ======================================================
    encoder_structure_config = {
        'discrete_column_dims': discrete_column_dims,
        'num_discrete': num_discrete_columns,
        'num_real': num_real_columns,
        'encoder_layers': {
            'activation': config[data_set]['encoder_layers']['activation'],
            # Last encoder layer projects into the latent space.
            'layer_dims': config[data_set]['encoder_layers']['layer_dims'] + [latent_dim]
        },
    }

    # ======================================================
    # Decoder structure
    # ======================================================
    # Output width: all real columns plus one unit per binary column and
    # ``dim`` units per multi-category column.
    final_op_dims = num_real_columns + sum(
        1 if v == 2 else v for v in discrete_column_dims.values()
    )

    decoder_structure_config = {
        'discrete_column_dims': discrete_column_dims,
        'num_discrete': num_discrete_columns,
        'num_real': num_real_columns,
        'decoder_layers': {
            'activation': config[data_set]['decoder_layers']['activation'],
            # Decoder mirrors the encoder: latent -> hidden dims -> output.
            'layer_dims': [latent_dim] + config[data_set]['decoder_layers']['layer_dims'] + [final_op_dims]
        },
        'final_output_dim': final_op_dims,
    }

    # =====================
    # GMM
    # =====================
    # GMM input: latent vector plus 2 extra features
    # (presumably reconstruction-error terms — TODO confirm in model code).
    gmm_input_dims = latent_dim + 2
    activation = config[data_set]['gmm']['FC_layer']['activation']
    num_components = config[data_set]['gmm']['num_components']
    FC_layer_dims = [gmm_input_dims] + config[data_set]['gmm']['FC_layer']['dims'] + [num_components]
    FC_dropout = config[data_set]['gmm']['FC_dropout']
    gmm_structure_config = {
        'num_components': num_components,
        'FC_layer_dims': FC_layer_dims,
        'FC_dropout': FC_dropout,
        'FC_activation': activation
    }

    # One loss term per discrete column, plus a single term covering all
    # real-valued columns.
    loss_structure_config = [
        {'dim': dim, 'type': 'onehot'}
        for dim in discrete_column_dims.values()
    ]
    loss_structure_config.append(
        {'dim': num_real_columns, 'type': 'real'}
    )

    return encoder_structure_config, decoder_structure_config, gmm_structure_config, loss_structure_config, latent_dim
Example #3
0
def execute_run(DATA_SET):
    """Train the autoencoder model on ``DATA_SET`` and report anomaly-detection auPR.

    Builds the model from ``architecture_config.yaml``, trains on the normal
    training split, then for each of the 5 anomaly sets scores
    normal-test vs. anomaly samples and computes the area under the
    precision-recall curve by sweeping a score threshold.

    Returns:
        (_mean, _std): mean and standard deviation of auPR over the
        anomaly sets.
    """
    global LOGGER

    encoder_structure_config, decoder_structure_config, loss_structure_config, latent_dim = create_config(
        DATA_SET)

    config_file = 'architecture_config.yaml'
    with open(config_file, 'r') as fh:
        config = yaml.safe_load(fh)

    # All training hyperparameters come from the dataset's YAML section.
    anomaly_ratio = config[DATA_SET]['anomaly_ratio']
    LR = config[DATA_SET]['LR']
    batch_size = config[DATA_SET]['batch_size']
    epochs = config[DATA_SET]['epochs']
    dropout = config[DATA_SET]['ae_dropout']

    ae_model = Model(DEVICE,
                     latent_dim,
                     encoder_structure_config,
                     decoder_structure_config,
                     loss_structure_config,
                     optimizer='Adam',
                     batch_size=batch_size,
                     num_epochs=epochs,
                     learning_rate=LR,
                     dropout=dropout)

    print(ae_model.network_module)

    num_anomaly_sets = 5
    data_dict, _ = data_fetcher.get_data(DATA_SET,
                                         one_hot=True,
                                         num_anom_sets=num_anomaly_sets,
                                         anomaly_ratio=anomaly_ratio)

    train_df = data_dict['train']
    train_X = train_df.values
    ae_model.train_model(train_X)
    test_norm_df = data_dict['test']
    test_norm_X = test_norm_df.values

    auc_list = []
    ae_model.mode = 'test'

    def _normalize_(val, _min, _max):
        # Min-max scale a score into [0, 1].
        return (val - _min) / (_max - _min)

    for idx in range(1, num_anomaly_sets + 1):
        key = 'anom_' + str(idx)
        test_anom_df = data_dict[key]
        test_anom_X = test_anom_df.values
        x1 = test_norm_X
        x2 = test_anom_X

        x1_scores = ae_model.get_score(x1)
        x2_scores = ae_model.get_score(x2)

        # label 1 = normal, label 0 = anomaly.
        res_data = []
        labels = [1 for _ in range(x1.shape[0])
                  ] + [0 for _ in range(x2.shape[0])]
        _scores = np.concatenate([x1_scores, x2_scores], axis=0)
        for i, j in zip(_scores, labels):
            res_data.append((i, j))

        res_df = pd.DataFrame(res_data, columns=['score', 'label'])
        res_df = res_df.sort_values(by=['score'], ascending=False)

        _max = max(res_df['score'])
        _min = min(res_df['score'])

        # NOTE(review): ``parallel_apply`` requires pandarallel to be
        # initialized by the caller.
        res_df['score'] = res_df['score'].parallel_apply(_normalize_,
                                                         args=(
                                                             _min,
                                                             _max,
                                                         ))

        _max = max(res_df['score'])
        _min = min(res_df['score'])

        step = (_max - _min) / 100

        # Vary the threshold from the top score downwards, recording
        # precision/recall for the "anomaly" class at each cut-off.
        thresh = _max - step
        thresh = round(thresh, 3)
        num_anomalies = x2.shape[0]
        P = []
        R = [0]
        while thresh >= _min:
            sel = res_df.loc[res_df['score'] >= thresh]
            if len(sel) == 0:
                thresh -= step
                continue

            correct = sel.loc[sel['label'] == 0]

            prec = len(correct) / len(sel)
            rec = len(correct) / num_anomalies
            P.append(prec)
            R.append(rec)
            thresh -= step
            thresh = round(thresh, 3)

        # Guard against an empty sweep: the original ``P[0]`` access would
        # raise IndexError if no threshold ever selected any rows.
        if not P:
            print('Warning: no points selected at any threshold; skipping', key)
            continue
        # Pad P so that len(P) == len(R) (R was seeded with 0).
        P = [P[0]] + P
        pr_auc = auc(R, P)
        try:
            plt.figure(figsize=[8, 6])
            plt.plot(R, P)
            plt.title('Precision Recall Curve  || auPR :' +
                      "{:0.4f}".format(pr_auc),
                      fontsize=15)
            plt.xlabel('Recall', fontsize=15)
            plt.ylabel('Precision', fontsize=15)
            plt.show()
        except Exception:
            # Plotting is best-effort (e.g. headless environments); never
            # let a display failure abort the run.  A bare ``except:`` here
            # would also have swallowed KeyboardInterrupt/SystemExit.
            pass
        print("AUC : {:0.4f} ".format(pr_auc))
        auc_list.append(pr_auc)

    _mean = np.mean(auc_list)
    _std = np.std(auc_list)
    print(' Mean AUC {:0.4f} '.format(_mean))
    print(' AUC std {:0.4f} '.format(_std))
    return _mean, _std
Example #4
0
# Driver script: repeat train/eval ``num_runs`` times and log aggregate auPR.
num_runs = args.num_runs
# One log file per dataset; every run's result is appended to it.
LOG_FILE = 'log_results_{}.txt'.format(DATA_SET)
LOGGER = utils.get_logger(LOG_FILE)
utils.log_time(LOGGER)
LOGGER.info(DATA_SET)
config_file = 'config.yaml'
with open(config_file, 'r') as fh:
    config = yaml.safe_load(fh)

# Evaluation settings come from the dataset's section of config.yaml.
num_anomaly_sets = config[DATA_SET]['num_anomaly_sets']
anomaly_ratio = config[DATA_SET]['anomaly_ratio']
results = []

for n in range(1, num_runs + 1):
    # Fresh data split (and anomaly sets) for every run.
    data_dict, _ = data_fetcher.get_data(DATA_SET,
                                         one_hot=True,
                                         num_anom_sets=num_anomaly_sets,
                                         anomaly_ratio=anomaly_ratio)

    dcn_obj = train_model(DATA_SET, data_dict, config)
    mean_aupr, std = test_eval(dcn_obj, data_dict, num_anomaly_sets)

    # Aggregate only the per-run mean; std is logged but not aggregated.
    results.append(mean_aupr)
    LOGGER.info(' Run {}: Mean: {:4f} | Std {:4f}'.format(n, mean_aupr, std))

mean_all_runs = np.mean(results)
print('Mean AuPR over  {} runs {:4f}'.format(num_runs, mean_all_runs))
print('Details: ', results)

LOGGER.info('Mean AuPR over  {} runs {:4f} Std {:4f}'.format(
    num_runs, mean_all_runs, np.std(results)))
LOGGER.info(' Details ' + str(results))
Example #5
0
        try:
            plt.figure()
            plt.title('PR Curve' + str(pr_auc))
            plt.plot(R, P)
            plt.show()
        except:
            pass
    return




# =================================== #
# Script setup: fetch the raw (non-one-hot) kddcup data and build the
# model-structure configs.  The names assigned here are consumed by the
# model construction further below.
data_set = 'kddcup'
data_dict, _ = data_fetcher.get_data(
    data_set,
    one_hot=False
)

train_df = data_dict['train']
train_X = train_df.values

# 4th element (loss_structure_config) is unused here.
encoder_structure_config, decoder_structure_config, gmm_structure_config, _, latent_dim = create_config(
    data_set
)
pprint(encoder_structure_config)
pprint(decoder_structure_config)
# =================================== #


dagmm_obj = DaGMM(
    DEVICE,