Example 1
def main(argv):
    global _DIR
    global OP_DIR
    global SAVE_DIR

    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    # also create the per-dataset subdirectory if it is missing
    if not os.path.exists(os.path.join(SAVE_DIR, _DIR)):
        os.mkdir(os.path.join(SAVE_DIR, _DIR))

    checkpoint_dir = SAVE_DIR

    print(os.getcwd())

    data_x, test_anom_id, test_all_id, test_x = get_data()
    data, inp_dims = get_training_data(data_x, FLAGS.neg_samples)

    num_domains = len(inp_dims)
    model_obj = APE_tf_model_1.model_ape_1()
    model_obj.set_model_params(num_entities=num_domains,
                               inp_dims=inp_dims,
                               neg_samples=FLAGS.neg_samples,
                               batch_size=FLAGS.batchsize,
                               num_epochs=FLAGS.num_epochs,
                               chkpt_dir=checkpoint_dir)

    model_obj.set_hyper_parameters(emb_dims=[10], use_bias=[True, False])

    print(FLAGS.use_pretrained)
    if FLAGS.use_pretrained is False:
        model_obj.build_model()
        model_obj.train_model(data)

    test_result_r = []
    test_result_p = []
    res = None
    for i in range(len(test_x)):

        _x = test_x[i]
        _x = np.vstack([_x, data_x])
        res = model_obj.inference(_x)
        all_ids = test_all_id[i]
        anomalies = test_anom_id[i]

        _id_score_dict = {_id: _score for _id, _score in zip(all_ids, res)}
        tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]

        recall, precison = evaluation_v1.precision_recall_curve(
            sorted_id_score_dict, anomaly_id_list=anomalies)

        print('--------------------------')

        from sklearn.metrics import auc
        _auc = auc(recall, precison)
        plt.figure(figsize=[14, 8])
        plt.plot(recall, precison, color='blue', linewidth=1.75)

        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Recall | AUC ' + str(_auc), fontsize=15)
        f_name = 'precison-recall_1_test_' + str(i) + '.png'
        f_path = os.path.join(OP_DIR, f_name)

        # plt.savefig(f_path)
        test_result_r.append(recall)
        test_result_p.append(precison)
        plt.close()

        print('----------------------------')

        x, y = evaluation_v1.performance_by_score(sorted_id_score_dict,
                                                  anomalies)

        plt.figure(figsize=[14, 8])
        plt.plot(x, y, color='red', linewidth=1.75)
        # plt.xlabel(' ', fontsize=15)
        plt.ylabel('Percentage of anomalies detected', fontsize=15)
        plt.title('Lowest % of scores', fontsize=15)

        f_name = 'score_1_test_' + str(i) + '.png'
        f_path = os.path.join(OP_DIR, f_name)

        plt.savefig(f_path)
        plt.close()

    plt.figure(figsize=[14, 8])
    j = 1
    mean_auc = 0
    for _x, _y in zip(test_result_r, test_result_p):
        plt.plot(_x, _y, linewidth=1.75, label='Test set ' + str(j))
        j += 1
        _auc = auc(_x, _y)
        print(_auc)
        mean_auc += _auc

    mean_auc = mean_auc / len(test_result_r)
    print('Mean ', mean_auc)
    plt.xlabel('Recall', fontsize=15)
    plt.ylabel('Precision', fontsize=15)
    plt.title('Precision Recall Curve', fontsize=17)
    plt.legend(loc='best')
    plt.show()
    plt.close()

    plt.figure(figsize=[14, 8])
    plt.title('Distribution of scores in Model 2', fontsize=17)
    plt.ylabel('Scores', fontsize=15)
    plt.xlabel('Samples', fontsize=15)
    _y = list(sorted(res))
    _x = list(range(len(_y)))
    plt.plot(_x, _y, linewidth=1.75)
    plt.show()
    plt.close()
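
The helper evaluation_v1.precision_recall_curve is not part of this listing. A minimal sketch of what such a ranking-based precision-recall computation could look like, given the ascending-sorted id-to-score dictionary used above (the function name and body are illustrative assumptions, not the project's implementation):

def precision_recall_sketch(sorted_id_score_dict, anomaly_id_list):
    # Walk the ranking from most to least anomalous (the dict above is sorted
    # ascending, i.e. lowest likelihood first) and record precision and recall
    # after every prefix of the ranked list.
    anomaly_set = set(anomaly_id_list)
    recall, precision = [], []
    hits = 0
    for rank, _id in enumerate(sorted_id_score_dict, start=1):
        if _id in anomaly_set:
            hits += 1
        recall.append(hits / len(anomaly_set))
        precision.append(hits / rank)
    return recall, precision

Because recall is non-decreasing along the ranking, sklearn.metrics.auc(recall, precision) can be applied to the returned lists, exactly as in the snippet above.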
Example 2
def main(argv):
    global _TIME_IT
    global _DIR
    global OP_DIR
    global SAVE_DIR
    global config
    setup()

    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    # also create the per-dataset subdirectory if it is missing
    if not os.path.exists(os.path.join(SAVE_DIR, _DIR)):
        os.mkdir(os.path.join(SAVE_DIR, _DIR))

    checkpoint_dir = SAVE_DIR
    print(os.getcwd())

    data_x, data_x_id, test_anom_id, test_all_id, test_x = get_data()
    count_test_sets = len(test_x)

    test_result_r = []
    test_result_p = []
    res = None
    start_time = time.time()
    for i in range(count_test_sets):

        train_data_x = np.vstack([data_x, test_x[i]])
        data, inp_dims = get_training_data(train_data_x,
                                           config[_DIR]['neg_samples'],
                                           index=i)

        num_domains = len(inp_dims)
        model_obj = APE_tf_model_1.model_ape_1(MODEL_NAME)
        model_obj.set_model_params(num_entities=num_domains,
                                   inp_dims=inp_dims,
                                   neg_samples=config[_DIR]['neg_samples'],
                                   batch_size=config[_DIR]['batch_size'],
                                   num_epochs=config[_DIR]['num_epocs'],
                                   lr=config[_DIR]['learning_rate'],
                                   chkpt_dir=checkpoint_dir)

        _emb_size = int(config[_DIR]['embed_size'])
        model_obj.set_hyper_parameters(emb_dims=[_emb_size],
                                       use_bias=[True, False])
        _use_pretrained = config[_DIR]['use_pretrained']

        if _use_pretrained is False:
            model_obj.build_model()
            model_obj.train_model(data)
        '''
        join the normal data + anomaly data
        join the normal data id +  anomaly data id 
        Maintain order
        '''
        _x = np.vstack([test_x[i], data_x])
        _x_id = list(test_all_id[i])
        _x_id.extend(data_x_id)

        res = model_obj.inference(_x)

        # Known anomalies
        anomalies = test_anom_id[i]

        _id_score_dict = {_id: _score for _id, _score in zip(_x_id, res)}
        '''
        sort by ascending 
        since lower likelihood means anomalous
        '''
        tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict, anomaly_id_list=anomalies)

        from sklearn.metrics import auc

        _auc = auc(recall, precison)
        print('AUC', _auc)

        print('--------------------------')
        '''
            if _TIME_IT == False:

            _auc = auc(recall, precison)
            print('AUC', _auc)
            plt.figure(figsize=[14, 8])
            plt.plot(
                recall,
                precison,
                color='blue', linewidth=1.75)

            plt.xlabel('Recall', fontsize=15)
            plt.ylabel('Precision', fontsize=15)
            plt.title('Recall | AUC ' + str(_auc), fontsize=15)
            f_name = 'precison-recall_1_test_' + str(i) + '.png'
            f_path = os.path.join(OP_DIR, f_name)

            # plt.savefig(f_path)
            test_result_r.append(recall)
            test_result_p.append(precison)
            plt.close()
        '''

        print('----------------------------')

    end_time = time.time()
    avg_time = (end_time - start_time) / count_test_sets

    all_auc = []
    plt.figure(figsize=[14, 8])
    j = 1
    for _x, _y in zip(test_result_r, test_result_p):
        plt.plot(_x, _y, linewidth=1.75, label='Test set ' + str(j))
        j += 1
        _auc = auc(_x, _y)
        print(_auc)
        all_auc.append(_auc)

    mean_auc = np.mean(all_auc)
    print('Mean AUC', mean_auc)

    print(" ======================== ")
    '''
        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Precision Recall Curve', fontsize=17)
        plt.legend(loc='best')
        # plt.show()
        plt.close()
    
    '''
    '''
        plt.figure(figsize=[14, 8])
        plt.title('Distribution of scores in Model 2', fontsize=17)
        plt.ylabel('Scores', fontsize=15)
        plt.xlabel('Samples', fontsize=15)
        _y = list(sorted(res))
        _x = list(range(len(_y)))
        plt.plot(
            _x,
            _y,
            linewidth=1.75
        )
    
        # plt.show()
        plt.close()
    
    '''
    # ------------------------------------
    # Save the results
    # ------------------------------------
    _dict = {
        'mean_auc': mean_auc,
        'all_auc': ';'.join([str(_) for _ in all_auc]),
        'time': avg_time
    }

    for k, v in config[_DIR].items():
        _dict[k] = str(v)

    _dict = {k: [v] for k, v in _dict.items()}
    df = pd.DataFrame(_dict)

    res_fname = 'ape_result_v2' + str(time.time()).split('.')[0] + '.csv'
    df.to_csv(os.path.join(OP_DIR, res_fname))
    if _TIME_IT:
        print('Time Taken :', avg_time)
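
Example 2 reads every hyperparameter from a YAML config keyed by _DIR. The file itself is not included here, but the lookups made above imply roughly the following structure, shown as the Python dict that yaml.safe_load would return (the dataset key and all values are placeholders, not the project's real settings):

config = {
    'some_dataset': {            # whatever _DIR selects
        'neg_samples': 10,
        'batch_size': 256,
        'num_epocs': 50,         # note the spelling used by the code
        'learning_rate': 0.001,
        'embed_size': 16,
        'use_pretrained': False,
        'saved_model_file': None,
    }
}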
def main(argv=None):
    global embedding_dims
    global SAVE_DIR
    global _DIR
    global DATA_DIR
    global config
    global CONFIG_FILE
    global MODEL_NAME
    global DOMAIN_DIMS


    with open(CONFIG_FILE) as f:
        config = yaml.safe_load(f)

    _DIR = config['_DIR']
    DATA_DIR = config['DATA_DIR'] + '/' + _DIR
    setup_general_config()

    if not os.path.exists(os.path.join(SAVE_DIR, 'checkpoints')):
        os.mkdir(os.path.join(SAVE_DIR, 'checkpoints'))

    # ------------ #

    data_x, test_anom_id, test_all_id, test_x, train_ids = get_data()
    DOMAIN_DIMS = get_domain_dims()
    model_obj = set_up_model()

    _use_pretrained = FLAGS.use_pretrained
    if _use_pretrained is False:
        model_obj.train_model(data_x)


    if _use_pretrained is True:
        pretrained_file = None
        if config['saved_model_file'] is None:
            if FLAGS.saved_model_file is not None:
                pretrained_file = FLAGS.saved_model_file
        else:
            pretrained_file = config['saved_model_file']
        print('Pretrained File :', pretrained_file)

        print('Saved file ::', FLAGS.saved_model_file)
        saved_file_path = os.path.join(
            SAVE_DIR,
            'checkpoints',
            pretrained_file
        )
        model_obj.set_pretrained_model_file(saved_file_path)


    test_result_r = []
    test_result_p = []

    for i in range(len(test_x)-1):

        # combine the test and train data - since it is a density based method
        _x = np.vstack([data_x, test_x[i]])


        mean_embeddings = model_obj.get_embedding_mean(_x)
        print(data_x.shape[0], test_x[i].shape[0], _x.shape[0] ,mean_embeddings.shape[0])

        _test_all_id = test_all_id[i]


        _all_ids = list(train_ids)
        _all_ids.extend(list(_test_all_id))


        anomalies = test_anom_id[i]


        # USE LOF here
        sorted_id_score_dict = lof_1.anomaly_1(
            id_list=_all_ids,
            embed_list=mean_embeddings
        )
        print(' >>>> ', len(sorted_id_score_dict))

        _scored_dict_test = {}

        for k1, v in sorted_id_score_dict.items():
            # keep only the scores that belong to records in this test set
            if k1 in _test_all_id:
                _scored_dict_test[k1] = v

        recall, precison = evaluation_v1.precision_recall_curve(
            _scored_dict_test,
            anomaly_id_list=anomalies
        )
        test_result_r.append(recall)
        test_result_p.append(precison)


        print('--------------------------')


        _auc = auc(recall, precison)
        plt.figure(figsize=[14, 8])
        plt.plot(
            recall,
            precison,
            color='blue', linewidth=1.75)
        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Recall | AUC ' + str(_auc), fontsize=15)
        f_name = 'precison-recall_1_test_' + str(i) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        # plt.savefig(f_path)
        plt.close()


    plt.figure(figsize=[14, 8])
    j = 1
    res_str = 'auPR : '
    for _x,_y in zip(test_result_r,test_result_p):
        plt.plot(
            _x,
            _y,
            linewidth=1.75,
            label='Test set ' + str(j)
        )
        j += 1
        _auc = auc(_x, _y)
        res_str += ' ' + "{0:.2f}".format(_auc)
        print(_auc)

    plt.xlabel('Recall', fontsize=15)
    plt.ylabel('Precision', fontsize=15)
    plt.title('Precision Recall Curve ' + res_str, fontsize=18)
    plt.legend(loc='best')
    f_name = 'precison-recall_test_' + str(i) + '.png'
    f_path = os.path.join(OP_DIR, f_name)
    plt.savefig(f_path)
    plt.show()
    plt.close()
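
lof_1.anomaly_1 is called above but not defined in this listing; it maps record ids to outlier scores computed from the mean embeddings and returns them ranked. One plausible stand-in built on scikit-learn's LocalOutlierFactor is sketched below (an assumption about its behaviour, not the project's code; the most-anomalous-first ordering mirrors the other examples):

from collections import OrderedDict
from sklearn.neighbors import LocalOutlierFactor

def anomaly_1_sketch(id_list, embed_list, n_neighbors=20):
    # Fit LOF on the (mean) embedding vectors. negative_outlier_factor_ is
    # more negative for stronger outliers, so negate it to obtain a score
    # where higher means more anomalous.
    lof = LocalOutlierFactor(n_neighbors=n_neighbors)
    lof.fit_predict(embed_list)
    scores = -lof.negative_outlier_factor_
    # Rank ids from most to least anomalous.
    ranked = sorted(zip(id_list, scores), key=lambda kv: kv[1], reverse=True)
    return OrderedDict(ranked)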
def process(idx, CONFIG, _DIR, data_x, test_x, train_ids, test_all_id,
            test_anom_id, test_SerialID, entity_prob_test, eval_type):
    model_obj = set_up_model(CONFIG, _DIR)
    _x = np.vstack([data_x, test_x[idx]])

    model_obj.set_SerialID(test_SerialID[idx])
    _use_pretrained = CONFIG[_DIR]['use_pretrained']

    if _use_pretrained is True:
        saved_file_path = None

        pretrained_file = CONFIG[_DIR]['saved_model_file']
        _pretrained_file = None
        _match = '_serialID_' + str(test_SerialID[idx])

        if type(pretrained_file) == list:
            # search for the one that matches test_SerialID
            for _p in pretrained_file:
                if _match in _p:
                    _pretrained_file = _p
                    break

            print('Pretrained File :', _pretrained_file)
            saved_file_path = os.path.join(SAVE_DIR, 'checkpoints',
                                           _pretrained_file)
        elif pretrained_file is False:
            # Find the pretrained file
            __fname = '*' + model_obj.model_signature + '*' + _match + '*.pb'
            try:
                saved_file_path = glob.glob(
                    os.path.join(SAVE_DIR, 'checkpoints', __fname))[0]
            except IndexError:
                # no saved checkpoint matched this serial id
                saved_file_path = None

        if saved_file_path is not None:
            model_obj.set_pretrained_model_file(saved_file_path)
        else:
            model_obj.train_model(_x)

    elif _use_pretrained is False:
        model_obj.train_model(_x)

    _ep = entity_prob_test[idx]
    if CONFIG[_DIR]['w_mean']:
        mean_embeddings = model_obj.get_w_embedding_mean(_x, _ep)
    else:
        mean_embeddings = model_obj.get_embedding_mean(_x)

    _test_all_id = test_all_id[idx]
    _all_ids = list(train_ids)
    _all_ids.extend(list(_test_all_id))

    anomalies = test_anom_id[idx]
    print('Number of true anomalies', len(anomalies))

    # ---------------------
    # USE LOF here
    # ---------------------

    sorted_id_score_dict = lof_1.anomaly_1(id_list=_all_ids,
                                           embed_list=mean_embeddings)

    _scored_dict_test = OrderedDict(sorted_id_score_dict)

    if eval_type == 1:
        recall, precison = evaluation_v1.precision_recall_curve(
            _scored_dict_test, anomaly_id_list=anomalies)
    elif eval_type == 2:
        recall, precison = evaluation_v2.precision_recall_curve(
            _scored_dict_test, anomaly_id_list=anomalies)
    else:
        raise ValueError('Unknown eval_type: {}'.format(eval_type))

    # test_result_r.append(recall)
    # test_result_p.append(precison)
    cur_auc = auc(recall, precison)
    print('AUC ::', cur_auc)
    print('--------------------------')
    return cur_auc, recall, precison
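
process() handles a single test split and returns its AUC together with the precision-recall points; the driver that iterates over the splits is not part of this excerpt. A plain-loop sketch of how it could be invoked and aggregated (the wrapper name and the reporting are assumptions):

import numpy as np

def run_all_splits_sketch(CONFIG, _DIR, data_x, test_x, train_ids,
                          test_all_id, test_anom_id, test_SerialID,
                          entity_prob_test, eval_type=1):
    # Call process() once per test split and aggregate the per-split AUCs.
    all_auc = []
    for idx in range(len(test_x)):
        cur_auc, recall, precison = process(
            idx, CONFIG, _DIR, data_x, test_x, train_ids, test_all_id,
            test_anom_id, test_SerialID, entity_prob_test, eval_type)
        all_auc.append(cur_auc)
    print('Mean AUC', np.mean(all_auc))
    return all_auc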
def main(_dir=None):
    global DATA_DIR
    global _DIR
    global config
    global OP_DIR
    global DATA_X
    global DISCARD_0

    _dir = _args['_dir']
    k_val = _args['k_val']

    DISCARD_0 = _args['discard_0']

    setup(_dir)
    _DATA_X, test_anom_id, test_all_id, test_x = get_data()
    DATA_X = _DATA_X
    K = int(config['K'])
    # override
    if k_val is not None:
        K = k_val
    print('K =', K)

    N = DATA_X.shape[0]
    obj_ADTree = ad_tree_v1.ADT()
    obj_ADTree.setup(DATA_X)

    attribute_list = list(range(DATA_X.shape[1]))
    print('Attribute list', attribute_list)

    attribute_set_pairs = get_attribute_sets(attribute_list, obj_ADTree, k=K)

    print(attribute_set_pairs)
    print(' Number of attribute set pairs ', len(attribute_set_pairs))

    # Testing phase

    number_CV = len(test_all_id)
    for n in range(number_CV):
        start = time.time()
        test_data = test_x[n]
        id_list = test_all_id[n]
        anom_id_list = test_anom_id[n]
        result_dict = {}

        results = []
        for _id, record in zip(id_list, test_data):
            a = get_r_value(_id, record, obj_ADTree, attribute_set_pairs, N)
            results.append(a)

        for e in results:
            result_dict[e[0]] = e[1]

        end = time.time()
        print('-----------------------')
        print(_DIR)
        print('k = ', K)
        print(' Time taken :', end - start)
        # save file
        SAVE_FILE_OP = '_'.join([
            'result_alg_1_test_' + str(n), _DIR,
            str(time.time()).split('.')[0]
        ]) + '.pkl'

        SAVE_FILE_OP_PATH = os.path.join(DATA_DIR, SAVE_FILE_OP)
        with open(SAVE_FILE_OP_PATH, 'wb') as fh:
            pickle.dump(result_dict, fh, pickle.HIGHEST_PROTOCOL)

        tmp = sorted(result_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]

        print('--------------------------')

        # Plot the distribution of r values
        _y = list(sorted(list(result_dict.values())))
        _x = list(range(len(_y)))

        plt.figure(figsize=[14, 8])
        plt.plot(_x, _y, color='red', linewidth=1.5)
        plt.xlabel('Samples (sorted)', fontsize=15)
        plt.ylabel('Decision value r', fontsize=15)

        f_name = 'r_vals' + '_K_' + str(K) + '_test_' + str(
            n) + '_discard_0_' + str(DISCARD_0) + '.png'
        f_path = os.path.join(OP_DIR, f_name)

        plt.savefig(f_path)
        plt.close()
        # -------------------------------#

        print('--------------------------')

        recall, precison = evaluation_v1.precision_recall_curve(
            sorted_id_score_dict, anomaly_id_list=anom_id_list)

        _auc = auc(recall, precison)
        plt.figure(figsize=[14, 8])
        plt.plot(recall, precison, color='blue', linewidth=1.75)
        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Recall | AUC ' + str(_auc), fontsize=15)
        f_name = 'precison-recall_1' + '_K_' + str(K) + '_test_' + str(
            n) + '_discard_0_' + str(DISCARD_0) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        plt.savefig(f_path)
        plt.close()

        print('----------------------------')

        x, y = evaluation_v1.performance_by_score(sorted_id_score_dict,
                                                  anom_id_list)

        plt.figure(figsize=[14, 8])
        plt.plot(x, y, color='red', linewidth=1.75)
        # plt.xlabel(' ', fontsize=15)
        plt.ylabel('Percentage of anomalies detected', fontsize=15)
        plt.title('Lowest % of scores', fontsize=15)
        f_name = 'score_1_test_' + str(n) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        plt.savefig(f_path)
        plt.close()
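
Each test split's raw r-value dictionary is pickled into DATA_DIR above before plotting. A short sketch of reloading one of those files and rebuilding the ascending ranking that main() feeds to the evaluation (the file name below is a placeholder):

import operator
import pickle
from collections import OrderedDict

# Placeholder name; the real files are written as
# result_alg_1_test_<n>_<_DIR>_<timestamp>.pkl inside DATA_DIR.
with open('result_alg_1_test_0_example.pkl', 'rb') as fh:
    result_dict = pickle.load(fh)

# main() ranks records by ascending r value before evaluation; rebuild that
# ordering here.
sorted_id_score_dict = OrderedDict(
    sorted(result_dict.items(), key=operator.itemgetter(1)))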
def main():
    global _TIME_IT
    global _DIR
    global OP_DIR
    global SAVE_DIR
    global config
    setup()


    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    # also create the per-dataset subdirectory if it is missing
    if not os.path.exists(os.path.join(SAVE_DIR, _DIR)):
        os.mkdir(os.path.join(SAVE_DIR, _DIR))

    checkpoint_dir = SAVE_DIR
    print(os.getcwd())

    # data_x, test_anom_id, test_all_id, test_x =
    data_x, data_x_id, test_anom_id, test_all_id, test_x = get_data()
    # evaluate only the first test set here
    count_test_sets = min(len(test_x), 1)

    test_result_r = []
    test_result_p = []
    res = None
    time_arr = []
    auc_arr = []
    for i in range(count_test_sets):
        start_time = time.time()

        _x = test_x[i]
        _x = np.vstack([data_x, _x])

        test_ids = test_all_id[i]
        print(' >> ', len(test_ids))
        _x_id = list(data_x_id)
        _x_id.extend(test_ids)

        print(_x.shape)
        # _x = _x[:2000, :4]
        # _x_id = _x_id[:2000]

        print(_x.shape)
        print(len(_x_id))
        print(_x)

        # known anomalies
        anomaly_ids = test_anom_id[i]

        # ---- Core ------ #
        _df_input = []
        for _j in range(_x.shape[0]):
            _df_input.append(list(_x[_j]))

        cols = ['f' + str(j) for j in range(_x.shape[1])]
        X = pd.DataFrame(
            _df_input,
            columns=cols,
            index=[_j for _j in range(_x.shape[0])],
            dtype='category'
        )

        estimator = CompreX(logging_level=logging.ERROR)
        estimator.transform(X)
        estimator.fit(X)
        res = estimator.predict(X)
        '''
            'res' is ordered in the order of the input
            match it with the ordered list of ids
        '''
        anomaly_scores = list(res)
        anomaly_score_dict = { k:v for k,v in zip(_x_id,anomaly_scores) }

        # --------------- #
        ''' 
        Sort in reverse order, since higher score means anomaly 
        '''

        tmp = sorted(
            anomaly_score_dict.items(),
            key=operator.itemgetter(1),
            reverse=True
        )

        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]

        recall, precison = eval.precision_recall_curve(
            sorted_id_score_dict,
            anomaly_id_list=anomaly_ids
        )
        end_time = time.time()
        time_taken = end_time - start_time
        _auc = auc(recall, precison)

        print('Test case ', i , 'Time taken [seconds]', time_taken , 'AUC',  _auc)
        print('--------------------------')
        time_arr.append(time_taken)
        auc_arr.append(_auc)



    print('=================')
    print('Avg AUC :', np.mean(auc_arr))
    print('Avg time', np.mean(time_arr))


    '''
    if _TIME_IT == False:
    _auc = auc(recall, precison)
    print('AUC', _auc)
    plt.figure(figsize=[14, 8])
    plt.plot(
        recall,
        precison,
        color='blue', linewidth=1.75)

    plt.xlabel('Recall', fontsize=15)
    plt.ylabel('Precision', fontsize=15)
    plt.title('Recall | AUC ' + str(_auc), fontsize=15)
    f_name = 'precison-recall_1_test_' + str(i) + '.png'
    f_path = os.path.join(OP_DIR, f_name)

    # plt.savefig(f_path)
    test_result_r.append(recall)
    test_result_p.append(precison)
    plt.close()

    
    '''
    print('----------------------------')
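
The last example builds the CompreX input DataFrame row by row. An equivalent, more direct construction is sketched below (only a pandas idiom; CompreX itself and the surrounding variables are taken as given from the code above):

import pandas as pd

# Wrap the numpy array directly and cast every column to a categorical dtype,
# matching the row-by-row construction used in the example.
cols = ['f' + str(j) for j in range(_x.shape[1])]
X = pd.DataFrame(_x, columns=cols).astype('category')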