Code example #1
0
def process(CONFIG, _DIR, train_x_pos, train_x_neg, test_pos, test_anomaly):
    """Train (or load) the model, score the test set and plot precision/recall.

    CONFIG : dict of per-dataset configuration, keyed by _DIR.
    train_x_pos / train_x_neg : positive and negative (noise) training arrays;
        train_x_neg has shape (num_pos, num_neg_samples, num_features) —
        assumed from the reshape below, TODO confirm.
    test_pos / test_anomaly : pairs of (ids, data) for normal and anomalous
        test records.

    Logs the AUC and shows the precision-recall plot. Returns None.
    """
    global logger
    num_neg_samples = train_x_neg.shape[1]
    CONFIG[_DIR]['num_neg_samples'] = num_neg_samples
    model_obj = set_up_model(CONFIG, _DIR)
    _use_pretrained = CONFIG[_DIR]['use_pretrained']

    if _use_pretrained:
        pretrained_file = CONFIG[_DIR]['saved_model_file']
        print('Pretrained File :', pretrained_file)
        saved_file_path = os.path.join(SAVE_DIR, 'checkpoints',
                                       pretrained_file)
        # BUG FIX: the original tested `saved_file_path is not None` right
        # after an unconditional assignment, making the training fallback
        # unreachable. Fall back to training when the checkpoint is missing.
        if os.path.exists(saved_file_path):
            model_obj.set_pretrained_model_file(saved_file_path)
        else:
            model_obj.train_model(train_x_pos, train_x_neg)
    else:
        model_obj.train_model(train_x_pos, train_x_neg)

    # Assemble test data: normal rows first, then anomalies — ids are
    # stacked in the same order so they stay aligned with the scores.
    test_normal_ids = test_pos[0]
    test_anomaly_ids = test_anomaly[0]
    test_ids = list(np.hstack([test_normal_ids, test_anomaly_ids]))
    print(' Len of test_ids ', len(test_ids))
    test_data_x = np.vstack([test_pos[1], test_anomaly[1]])

    print('Length of test data', test_data_x.shape)
    res = model_obj.get_event_score(test_data_x)
    print('Length of results ', len(res))

    # Score bounds from training data: [min noise score, max positive score]
    training_pos_scores = [_[0] for _ in model_obj.get_event_score(train_x_pos)]
    train_noise = np.reshape(train_x_neg, [-1, train_x_pos.shape[-1]])
    training_noise_scores = [_[0] for _ in model_obj.get_event_score(train_noise)]
    bounds = [min(training_noise_scores), max(training_pos_scores)]

    res = list(res)
    # renamed loop variables so the builtin `id` is not shadowed
    _id_score_dict = {_id: _score for _id, _score in zip(test_ids, res)}
    # sort ascending: lower likelihood means more anomalous
    tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
    sorted_id_score_dict = OrderedDict()
    for _id, _score in tmp:
        # each score is a 1-element container; keep the scalar
        sorted_id_score_dict[_id] = _score[0]

    recall, precison = eval.precision_recall_curve(
        sorted_id_score_dict, anomaly_id_list=test_anomaly_ids, bounds=bounds)

    _auc = auc(recall, precison)
    logger.info('AUC')
    logger.info(str(_auc))

    print('--------------------------')
    plt.figure(figsize=[14, 8])
    plt.plot(recall, precison, color='blue', linewidth=1.75)
    plt.xlabel('Recall', fontsize=15)
    plt.ylabel('Precision', fontsize=15)
    plt.title('Recall | AUC ' + str(_auc), fontsize=15)
    # the save path was dead code (savefig commented out); show only
    plt.show()
    plt.close()

    return
Code example #2
0
def process_all(CONFIG, _DIR, train_x_pos, train_x_neg, testing_dict):
    """Train (or load) the model once, then evaluate every test split.

    testing_dict : maps a test-case index c to (test_pos, test_anomaly),
        each a pair of (ids, data) arrays.

    For each split, logs the precision/recall values (to be plotted later)
    and the AUC. Returns None.
    """
    global logger

    logger.info('setting up number of negative samples ')
    num_neg_samples = train_x_neg.shape[1]
    CONFIG[_DIR]['num_neg_samples'] = num_neg_samples
    logger.info(CONFIG[_DIR]['num_neg_samples'])
    model_obj = set_up_model(CONFIG, _DIR)

    _use_pretrained = CONFIG[_DIR]['use_pretrained']

    if _use_pretrained:
        pretrained_file = CONFIG[_DIR]['saved_model_file']
        print('Pretrained File :', pretrained_file)
        saved_file_path = os.path.join(SAVE_DIR, 'checkpoints',
                                       pretrained_file)
        # BUG FIX: the original tested `saved_file_path is not None` right
        # after an unconditional assignment, making the training fallback
        # unreachable. Fall back to training when the checkpoint is missing.
        if os.path.exists(saved_file_path):
            model_obj.set_pretrained_model_file(saved_file_path)
        else:
            model_obj.train_model(train_x_pos, train_x_neg)
    else:
        model_obj.train_model(train_x_pos, train_x_neg)

    # The score bounds depend only on the training data, so compute them
    # once here instead of once per test split (hoisted out of the loop).
    training_pos_scores = [_[0] for _ in model_obj.get_event_score(train_x_pos)]
    train_noise = np.reshape(train_x_neg, [-1, train_x_pos.shape[-1]])
    training_noise_scores = [_[0] for _ in model_obj.get_event_score(train_noise)]
    bounds = [min(training_noise_scores), max(training_pos_scores)]

    # one test case per value of c
    for _c, test_data_item in testing_dict.items():
        print('----->', _c)
        logger.info(' >> c = ' + str(_c))
        test_pos = test_data_item[0]
        test_anomaly = test_data_item[1]

        # normal rows first, then anomalies — ids stacked in the same order
        test_normal_ids = test_pos[0]
        test_anomaly_ids = test_anomaly[0]
        test_ids = list(np.hstack([test_normal_ids, test_anomaly_ids]))
        print(' Len of test_ids ', len(test_ids))
        test_data_x = np.vstack([test_pos[1], test_anomaly[1]])

        print('Length of test data', test_data_x.shape)
        res = list(model_obj.get_event_score(test_data_x))
        print('Length of results ', len(res))

        # renamed loop variables so the builtin `id` is not shadowed
        _id_score_dict = {_id: _score for _id, _score in zip(test_ids, res)}
        # sort ascending: lower likelihood means more anomalous
        tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for _id, _score in tmp:
            # each score is a 1-element container; keep the scalar
            sorted_id_score_dict[_id] = _score[0]

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict,
            anomaly_id_list=test_anomaly_ids,
            bounds=bounds)

        # Save the precision/recall values — later plotted in a notebook
        logger.info(','.join(str(_) for _ in precison))
        logger.info(','.join(str(_) for _ in recall))

        _auc = auc(recall, precison)
        logger.info('AUC')
        logger.info(str(_auc))

        print('--------------------------')
Code example #3
0
def main():
    """Fit CompreX on the positive training data and evaluate each test split.

    Uses module globals populated by setup(): DATA_DIR, _DIR (and others).
    For each test case c, prints the wall-clock prediction time and AUC.
    """
    global DATA_DIR
    global _TIME_IT
    global _DIR
    global OP_DIR
    global SAVE_DIR
    global config
    setup()

    train_x_pos, test_dict_cIdx_data = get_data(DATA_DIR, _DIR)

    # ---- Train ---- #
    # Build a categorical DataFrame (CompreX works on categorical features —
    # presumably, given dtype='category'; confirm against the estimator docs).
    cols = ['f' + str(j) for j in range(train_x_pos.shape[1])]
    X = pd.DataFrame([list(row) for row in train_x_pos],
                     columns=cols,
                     index=range(train_x_pos.shape[0]),
                     dtype='category')

    estimator = CompreX(logging_level=logging.ERROR)
    estimator.transform(X)
    estimator.fit(X)

    # ---- Test: one pass per value of c ---- #
    for c, _t_data in test_dict_cIdx_data.items():
        test_ids = _t_data[0]
        test_data_x = _t_data[1]
        test_anomaly_idList = _t_data[2]

        start_time = time.time()
        res = estimator.predict(test_data_x)

        # 'res' is ordered like the input; pair it with the ordered ids
        anomaly_score_dict = dict(zip(test_ids, list(res)))

        # sort in reverse order, since a higher score means anomaly
        tmp = sorted(anomaly_score_dict.items(),
                     key=operator.itemgetter(1),
                     reverse=True)
        sorted_id_score_dict = OrderedDict(tmp)

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict, anomaly_id_list=test_anomaly_idList)

        time_taken = time.time() - start_time
        _auc = auc(recall, precison)

        print('Time taken [seconds]', time_taken, 'AUC', _auc)
        print('--------------------------')
Code example #4
0
def main():
    """Train (or reuse) the APE model and evaluate AUC for test cases c=1..3.

    Uses module globals populated by setup(): _DIR, OP_DIR, SAVE_DIR,
    DATA_DIR, config (plus MODEL_NAME, logger, data_fetcher from the module).
    Logs per-c precision/recall strings and AUC, then the elapsed time.
    """
    global _DIR
    global OP_DIR
    global SAVE_DIR
    global DATA_DIR
    global config
    setup()

    # BUG FIX: the original only tried to create SAVE_DIR/_DIR when SAVE_DIR
    # was freshly created AND the subdirectory *already existed* (inverted
    # check) — so the subdirectory was never actually created.
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    _sub_dir = os.path.join(SAVE_DIR, _DIR)
    if not os.path.exists(_sub_dir):
        os.mkdir(_sub_dir)

    checkpoint_dir = os.path.join(SAVE_DIR)
    print(' > ', os.getcwd())

    train_x_pos, train_x_neg, APE_term_2, APE_term_4, test_pos, test_anomaly, domain_dims = data_fetcher.get_data_v1(
        DATA_DIR, _DIR)

    neg_samples = train_x_neg.shape[1]
    start_time = time.time()
    num_domains = len(domain_dims)
    inp_dims = list(domain_dims.values())

    print('Number of domains ', num_domains)
    print(' domains ', inp_dims)

    model_obj = tf_model_ape_1.model_ape_1(MODEL_NAME)

    model_obj.set_model_params(num_entities=num_domains,
                               inp_dims=inp_dims,
                               neg_samples=neg_samples,
                               batch_size=config[_DIR]['batch_size'],
                               num_epochs=config[_DIR]['num_epochs'],
                               lr=config[_DIR]['learning_rate'],
                               chkpt_dir=checkpoint_dir)

    _emb_size = int(config[_DIR]['embed_size'])
    model_obj.set_hyper_parameters(emb_dims=[_emb_size],
                                   use_bias=[True, False])

    # build + train only when not using a pretrained model
    if config[_DIR]['use_pretrained'] is False:
        model_obj.build_model()
        model_obj.train_model(train_x_pos, train_x_neg, APE_term_2, APE_term_4)

    # Score bounds from training data: [min noise score, max positive score].
    # NOTE(review): `bounds` is computed but never passed to
    # precision_recall_curve below — confirm whether that is intended.
    training_pos_scores = [_[0] for _ in model_obj.inference(train_x_pos)]
    train_noise = np.reshape(train_x_neg, [-1, train_x_pos.shape[-1]])
    training_noise_scores = [_[0] for _ in model_obj.inference(train_noise)]
    bounds = [min(training_noise_scores), max(training_pos_scores)]

    # test for c = 1, 2, 3
    for c in range(1, 3 + 1):
        _, _, _, _, test_pos, test_anomaly, _ = data_fetcher.get_data_v1(
            DATA_DIR, _DIR, c=c)

        # Join normal + anomaly data and ids, keeping the same order so
        # the ids stay aligned with the scores.
        test_normal_ids = test_pos[0]
        test_anomaly_ids = test_anomaly[0]
        test_ids = list(np.hstack([test_normal_ids, test_anomaly_ids]))
        print(' Len of test_ids ', len(test_ids))
        test_data_x = np.vstack([test_pos[1], test_anomaly[1]])

        print('Length of test data', test_data_x.shape)
        res = list(model_obj.inference(test_data_x))
        print('Length of results ', len(res))

        # renamed loop variables so the builtin `id` is not shadowed
        _id_score_dict = {_id: _score for _id, _score in zip(test_ids, res)}
        # sort ascending: lower likelihood means more anomalous
        tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for _id, _score in tmp:
            # each score is a 1-element container; keep the scalar
            sorted_id_score_dict[_id] = _score[0]

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict, anomaly_id_list=test_anomaly_ids)

        # Save the precision/recall values — later plotted in a notebook
        logger.info(','.join(str(_) for _ in precison))
        logger.info(','.join(str(_) for _ in recall))

        _auc = auc(recall, precison)
        logger.info('c=' + str(c))
        logger.info('AUC')
        logger.info(str(_auc))

        print('----------------------------')

        # CONSISTENCY FIX: use the module `logger` (as everywhere above),
        # not the root `logging` module. (The dead commented-out plotting
        # block that followed here was removed.)
        elapsed_time = time.time() - start_time
        logger.info('time taken')
        logger.info(str(elapsed_time))
0
def process(_dir=None):
    """Train an AD-Tree based detector and evaluate each test split.

    _dir : optional dataset directory name, forwarded to setup().

    For each test case c: scores every record via get_r_value, computes the
    precision-recall curve and AUC, logs them, and saves a PR plot to OP_DIR.
    """
    global DATA_DIR
    global _DIR
    global config
    global OP_DIR
    global logger

    setup(_dir)

    logger.info('--------')
    logger.info(_DIR)
    logger.info('--------')

    train_x, test_dict_cIdx_data = get_data(DATA_DIR, _DIR)

    # K comes from config. (Removed the dead `k_val` override: it was
    # hard-coded to None, so the config path was always taken.)
    K = int(config['K'])

    # ---- Train ---- #
    t1 = time.time()
    N = train_x.shape[0]  # number of training instances
    obj_ADTree = ad_tree_v1.ADT()
    obj_ADTree.setup(train_x)

    attribute_list = list(range(train_x.shape[1]))

    attribute_set_pairs = get_attribute_sets(train_x,
                                             attribute_list,
                                             obj_ADTree,
                                             k=K)
    train_time = time.time() - t1
    logger.info('train_time taken ' + str(train_time))
    print(train_time)

    print(' Number of attribute set pairs ', len(attribute_set_pairs))

    # ---- Test ---- #
    for c, _t_data in test_dict_cIdx_data.items():
        test_ids = _t_data[0]
        test_data_x = _t_data[1]
        test_anomaly_idList = _t_data[2]

        start = time.time()
        results = OrderedDict()
        for _id, record in zip(test_ids, test_data_x):
            _, r_score = get_r_value(_id, record, obj_ADTree,
                                     attribute_set_pairs, N)
            results[_id] = r_score
        _time = time.time() - start

        # sort ascending — lower r-score is treated as more anomalous
        # (presumably; confirm against get_r_value's contract)
        tmp = sorted(results.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict(tmp)

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict=sorted_id_score_dict,
            anomaly_id_list=test_anomaly_idList)

        _auc = auc(recall, precison)
        print('AUC ', _auc)
        logger.info('c = ' + str(c))
        logger.info('Time taken ' + str(_time))
        logger.info('AUC : ' + str(_auc))

        plt.figure(figsize=[14, 8])
        plt.plot(recall, precison, color='blue', linewidth=1.75)
        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Precision Recall | AUC ' + str(_auc), fontsize=15)
        f_name = 'precison-recall_1' + '_test_' + str(c) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        plt.savefig(f_path)
        plt.close()
Code example #6
0
# Fit a Local Outlier Factor model (novelty mode) on the projected training
# data and score the projected test set. score_samples returns lower values
# for more abnormal points, so ids are sorted ascending (most anomalous first).
from pprint import pprint

clf_1 = LocalOutlierFactor(n_neighbors=20, novelty=True)
clf_1.fit(_train_x)

_test_x = tsvd.transform(test_x)
result = clf_1.score_samples(_test_x)

# renamed loop variables so the builtin `id` is not shadowed
_id_score_dict = {_id: _score for _id, _score in zip(all_ids, list(result))}
sorted_id_score_dict = OrderedDict(
    sorted(_id_score_dict.items(), key=operator.itemgetter(1)))

pprint(result)

# NOTE(review): hard-coded score bounds [-1, 0] for the PR sweep — looks
# like an assumed range for the LOF scores; verify against the printed
# score distribution above. (Removed the dead commented-out bounds code.)
bounds = [-1, 0]

recall, precison = eval.precision_recall_curve(sorted_id_score_dict,
                                               anomaly_id_list=anom_ids,
                                               bounds=bounds)

_auc = auc(recall, precison)
print('_auc', _auc)