def main(argv):
    global _DIR
    global OP_DIR
    global SAVE_DIR

    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    # Create the per-dataset sub-directory if it does not exist yet.
    if not os.path.exists(os.path.join(SAVE_DIR, _DIR)):
        os.mkdir(os.path.join(SAVE_DIR, _DIR))
    checkpoint_dir = os.path.join(SAVE_DIR)
    print(os.getcwd())

    data_x, test_anom_id, test_all_id, test_x = get_data()
    data, inp_dims = get_training_data(data_x, FLAGS.neg_samples)
    num_domains = len(inp_dims)

    model_obj = APE_tf_model_1.model_ape_1()
    model_obj.set_model_params(
        num_entities=num_domains,
        inp_dims=inp_dims,
        neg_samples=FLAGS.neg_samples,
        batch_size=FLAGS.batchsize,
        num_epochs=FLAGS.num_epochs,
        chkpt_dir=checkpoint_dir
    )
    model_obj.set_hyper_parameters(
        emb_dims=[10],
        use_bias=[True, False]
    )

    print(FLAGS.use_pretrained)
    if FLAGS.use_pretrained is False:
        model_obj.build_model()
        model_obj.train_model(data)

    test_result_r = []
    test_result_p = []
    res = None

    for i in range(len(test_x)):
        # Score the test records together with the training records.
        _x = test_x[i]
        _x = np.vstack([_x, data_x])
        res = model_obj.inference(_x)

        all_ids = test_all_id[i]
        anomalies = test_anom_id[i]

        # zip truncates at len(all_ids), so only the test ids keep their scores.
        _id_score_dict = {_id: _score for _id, _score in zip(all_ids, res)}

        # Sort by score in ascending order (lower likelihood => more anomalous).
        tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]

        recall, precision = evaluation_v1.precision_recall_curve(
            sorted_id_score_dict,
            anomaly_id_list=anomalies
        )
        print('--------------------------')

        from sklearn.metrics import auc
        _auc = auc(recall, precision)

        plt.figure(figsize=[14, 8])
        plt.plot(recall, precision, color='blue', linewidth=1.75)
        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Recall | AUC ' + str(_auc), fontsize=15)
        f_name = 'precison-recall_1_test_' + str(i) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        # plt.savefig(f_path)
        test_result_r.append(recall)
        test_result_p.append(precision)
        plt.close()

        print('----------------------------')

        x, y = evaluation_v1.performance_by_score(sorted_id_score_dict, anomalies)
        plt.figure(figsize=[14, 8])
        plt.plot(x, y, color='red', linewidth=1.75)
        # plt.xlabel(' ', fontsize=15)
        plt.ylabel('Percentage of anomalies detected', fontsize=15)
        plt.title('Lowest % of scores', fontsize=15)
        f_name = 'score_1_test_' + str(i) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        plt.savefig(f_path)
        plt.close()

    # Aggregate precision-recall curves over all test sets.
    plt.figure(figsize=[14, 8])
    j = 1
    mean_auc = 0
    for _x, _y in zip(test_result_r, test_result_p):
        plt.plot(_x, _y, linewidth=1.75, label='Test set ' + str(j))
        j += 1
        _auc = auc(_x, _y)
        print(_auc)
        mean_auc += _auc

    mean_auc = mean_auc / len(test_result_r)
    print('Mean ', mean_auc)
    plt.xlabel('Recall', fontsize=15)
    plt.ylabel('Precision', fontsize=15)
    plt.title('Precision Recall Curve', fontsize=17)
    plt.legend(loc='best')
    plt.show()
    plt.close()

    # Distribution of scores from the last test set.
    plt.figure(figsize=[14, 8])
    plt.title('Distribution of scores in Model 2', fontsize=17)
    plt.ylabel('Scores', fontsize=15)
    plt.xlabel('Samples', fontsize=15)
    _y = list(sorted(res))
    _x = list(range(len(_y)))
    plt.plot(_x, _y, linewidth=1.75)
    plt.show()
    plt.close()
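# evaluation_v1.precision_recall_curve is a project-local helper that is not shown in this
# excerpt. The sketch below is a hypothetical reference implementation of the assumed
# behaviour: the input dict is already ordered from most to least anomalous, and the curve
# is traced by treating the top-n ranked ids as the predicted anomalies for n = 1..N.
# The name _precision_recall_curve_sketch is illustrative, not part of the project.
def _precision_recall_curve_sketch(sorted_id_score_dict, anomaly_id_list):
    anomalies = set(anomaly_id_list)
    recall, precision = [], []
    true_positives = 0
    for n, _id in enumerate(sorted_id_score_dict.keys(), start=1):
        if _id in anomalies:
            true_positives += 1
        precision.append(true_positives / n)
        recall.append(true_positives / len(anomalies))
    return recall, precision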
def main(argv):
    global _TIME_IT
    global _DIR
    global OP_DIR
    global SAVE_DIR
    global config

    setup()
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    # Create the per-dataset sub-directory if it does not exist yet.
    if not os.path.exists(os.path.join(SAVE_DIR, _DIR)):
        os.mkdir(os.path.join(SAVE_DIR, _DIR))
    checkpoint_dir = os.path.join(SAVE_DIR)
    print(os.getcwd())

    data_x, data_x_id, test_anom_id, test_all_id, test_x = get_data()
    count_test_sets = len(test_x)

    test_result_r = []
    test_result_p = []
    res = None

    start_time = time.time()
    for i in range(count_test_sets):
        # Retrain on the training data plus the current test set.
        train_data_x = np.vstack([data_x, test_x[i]])
        data, inp_dims = get_training_data(
            train_data_x,
            config[_DIR]['neg_samples'],
            index=i
        )
        num_domains = len(inp_dims)

        model_obj = APE_tf_model_1.model_ape_1(MODEL_NAME)
        model_obj.set_model_params(
            num_entities=num_domains,
            inp_dims=inp_dims,
            neg_samples=config[_DIR]['neg_samples'],
            batch_size=config[_DIR]['batch_size'],
            num_epochs=config[_DIR]['num_epocs'],
            lr=config[_DIR]['learning_rate'],
            chkpt_dir=checkpoint_dir
        )
        _emb_size = int(config[_DIR]['embed_size'])
        model_obj.set_hyper_parameters(
            emb_dims=[_emb_size],
            use_bias=[True, False]
        )

        _use_pretrained = config[_DIR]['use_pretrained']
        if _use_pretrained is False:
            model_obj.build_model()
            model_obj.train_model(data)

        # Join the anomaly (test) data with the normal data, and join their ids
        # in the same order, so that scores and ids stay aligned.
        _x = np.vstack([test_x[i], data_x])
        _x_id = list(test_all_id[i])
        _x_id.extend(data_x_id)

        res = model_obj.inference(_x)
        # Known anomalies
        anomalies = test_anom_id[i]

        _id_score_dict = {_id: _score for _id, _score in zip(_x_id, res)}

        # Sort in ascending order, since a lower likelihood means more anomalous.
        tmp = sorted(_id_score_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]

        recall, precision = eval.precision_recall_curve(
            sorted_id_score_dict,
            anomaly_id_list=anomalies
        )
        # Keep the per-test-set curves for the aggregate AUC computed below.
        test_result_r.append(recall)
        test_result_p.append(precision)

        from sklearn.metrics import auc
        _auc = auc(recall, precision)
        print('AUC', _auc)
        print('--------------------------')
        '''
        if _TIME_IT == False:
            _auc = auc(recall, precision)
            print('AUC', _auc)
            plt.figure(figsize=[14, 8])
            plt.plot(recall, precision, color='blue', linewidth=1.75)
            plt.xlabel('Recall', fontsize=15)
            plt.ylabel('Precision', fontsize=15)
            plt.title('Recall | AUC ' + str(_auc), fontsize=15)
            f_name = 'precison-recall_1_test_' + str(i) + '.png'
            f_path = os.path.join(OP_DIR, f_name)
            # plt.savefig(f_path)
            plt.close()
        '''
        print('----------------------------')

    end_time = time.time()
    avg_time = (end_time - start_time) / count_test_sets

    all_auc = []
    plt.figure(figsize=[14, 8])
    j = 1
    for _x, _y in zip(test_result_r, test_result_p):
        plt.plot(_x, _y, linewidth=1.75, label='Test set ' + str(j))
        j += 1
        _auc = auc(_x, _y)
        print(_auc)
        all_auc.append(_auc)

    mean_auc = np.mean(all_auc)
    print('Mean AUC', mean_auc)
    print(" ======================== ")
    '''
    plt.xlabel('Recall', fontsize=15)
    plt.ylabel('Precision', fontsize=15)
    plt.title('Precision Recall Curve', fontsize=17)
    plt.legend(loc='best')
    # plt.show()
    plt.close()
    '''
    '''
    plt.figure(figsize=[14, 8])
    plt.title('Distribution of scores in Model 2', fontsize=17)
    plt.ylabel('Scores', fontsize=15)
    plt.xlabel('Samples', fontsize=15)
    _y = list(sorted(res))
    _x = list(range(len(_y)))
    plt.plot(_x, _y, linewidth=1.75)
    # plt.show()
    plt.close()
    '''

    # ------------------------------------
    # Save the results
    # ------------------------------------
    _dict = {
        'mean_auc': mean_auc,
        'all_auc': ';'.join([str(_) for _ in all_auc]),
        'time': avg_time
    }
    for k, v in config[_DIR].items():
        _dict[k] = str(v)
    # pandas needs list-valued columns to build a single-row DataFrame.
    _dict = {k: [v] for k, v in _dict.items()}
    df = pd.DataFrame(_dict)
    res_fname = 'ape_result_v2' + str(time.time()).split('.')[0] + '.csv'
    df.to_csv(os.path.join(OP_DIR, res_fname))

    if _TIME_IT:
        print('Time Taken :', avg_time)
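# The driver above reads every hyper-parameter from config[_DIR]. The config file itself is
# not part of this excerpt; the entry below is an illustrative sketch only -- the key names
# are taken from the code, the values are placeholders, not the values used in experiments.
_EXAMPLE_CONFIG_ENTRY = {
    'neg_samples': 10,        # negative samples per observed record
    'batch_size': 256,
    'num_epocs': 100,         # key spelled exactly as referenced in the code
    'learning_rate': 0.001,
    'embed_size': 16,
    'use_pretrained': False,
}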
def main(argv=None):
    global embedding_dims
    global SAVE_DIR
    global _DIR
    global DATA_DIR
    global config
    global CONFIG_FILE
    global MODEL_NAME
    global DOMAIN_DIMS

    with open(CONFIG_FILE) as f:
        config = yaml.safe_load(f)

    _DIR = config['_DIR']
    DATA_DIR = config['DATA_DIR'] + '/' + _DIR
    setup_general_config()

    if not os.path.exists(os.path.join(SAVE_DIR, 'checkpoints')):
        os.mkdir(os.path.join(SAVE_DIR, 'checkpoints'))

    # ------------ #
    data_x, test_anom_id, test_all_id, test_x, train_ids = get_data()
    DOMAIN_DIMS = get_domain_dims()

    model_obj = set_up_model()
    _use_pretrained = FLAGS.use_pretrained

    if _use_pretrained is False:
        model_obj.train_model(data_x)

    if _use_pretrained is True:
        # Prefer the file named in the config; fall back to the command line flag.
        pretrained_file = None
        if config['saved_model_file'] is None:
            if FLAGS.saved_model_file is not None:
                pretrained_file = FLAGS.saved_model_file
        else:
            pretrained_file = config['saved_model_file']

        print('Pretrained File :', pretrained_file)
        print('Saved file ::', FLAGS.saved_model_file)
        saved_file_path = os.path.join(
            SAVE_DIR,
            'checkpoints',
            pretrained_file
        )
        model_obj.set_pretrained_model_file(saved_file_path)

    test_result_r = []
    test_result_p = []

    for i in range(len(test_x) - 1):
        # Combine the test and train data, since LOF is a density based method.
        _x = np.vstack([data_x, test_x[i]])
        mean_embeddings = model_obj.get_embedding_mean(_x)
        print(data_x.shape[0], test_x[i].shape[0], _x.shape[0], mean_embeddings.shape[0])

        _test_all_id = test_all_id[i]
        _all_ids = list(train_ids)
        _all_ids.extend(list(_test_all_id))
        anomalies = test_anom_id[i]

        # Use LOF on the mean entity embeddings.
        sorted_id_score_dict = lof_1.anomaly_1(
            id_list=_all_ids,
            embed_list=mean_embeddings
        )
        print(' >>>> ', len(sorted_id_score_dict))

        # Keep only the scores that belong to the test ids.
        _scored_dict_test = {}
        for k1, v in sorted_id_score_dict.items():
            if k1 in _test_all_id:
                _scored_dict_test[k1] = v

        recall, precision = evaluation_v1.precision_recall_curve(
            _scored_dict_test,
            anomaly_id_list=anomalies
        )
        test_result_r.append(recall)
        test_result_p.append(precision)
        print('--------------------------')

        _auc = auc(recall, precision)
        plt.figure(figsize=[14, 8])
        plt.plot(recall, precision, color='blue', linewidth=1.75)
        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Recall | AUC ' + str(_auc), fontsize=15)
        f_name = 'precison-recall_1_test_' + str(i) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        # plt.savefig(f_path)
        plt.close()

    # Aggregate precision-recall curves over all test sets.
    plt.figure(figsize=[14, 8])
    j = 1
    res_str = 'auPR : '
    for _x, _y in zip(test_result_r, test_result_p):
        plt.plot(_x, _y, linewidth=1.75, label='Test set ' + str(j))
        j += 1
        _auc = auc(_x, _y)
        res_str += ' ' + "{0:.2f}".format(_auc)
        print(_auc)

    plt.xlabel('Recall', fontsize=15)
    plt.ylabel('Precision', fontsize=15)
    plt.title('Precision Recall Curve ' + res_str, fontsize=18)
    plt.legend(loc='best')
    f_name = 'precison-recall_test_' + str(i) + '.png'
    f_path = os.path.join(OP_DIR, f_name)
    plt.savefig(f_path)
    plt.show()
    plt.close()
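# lof_1.anomaly_1 is a project-local module that is not shown in this excerpt. The sketch
# below is a possible stand-in built on scikit-learn's LocalOutlierFactor, assuming the
# helper returns an id -> score mapping ordered most-anomalous first. The function name
# _lof_scores_sketch and the n_neighbors default are illustrative only.
from collections import OrderedDict
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def _lof_scores_sketch(id_list, embed_list, n_neighbors=20):
    lof = LocalOutlierFactor(n_neighbors=n_neighbors)
    lof.fit_predict(np.asarray(embed_list))
    # negative_outlier_factor_ is close to -1 for inliers and grows more negative for
    # outliers; negate it so that a larger score means more anomalous.
    scores = -lof.negative_outlier_factor_
    pairs = sorted(zip(id_list, scores), key=lambda kv: kv[1], reverse=True)
    return OrderedDict(pairs)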
def process(
        idx,
        CONFIG,
        _DIR,
        data_x,
        test_x,
        train_ids,
        test_all_id,
        test_anom_id,
        test_SerialID,
        entity_prob_test,
        eval_type
):
    model_obj = set_up_model(CONFIG, _DIR)
    _x = np.vstack([data_x, test_x[idx]])
    model_obj.set_SerialID(test_SerialID[idx])
    _use_pretrained = CONFIG[_DIR]['use_pretrained']

    if _use_pretrained is True:
        saved_file_path = None
        pretrained_file = CONFIG[_DIR]['saved_model_file']
        _pretrained_file = None
        _match = '_serialID_' + str(test_SerialID[idx])

        if type(pretrained_file) == list:
            # Search for the saved model that matches test_SerialID.
            for _p in pretrained_file:
                if _match in _p:
                    _pretrained_file = _p
                    break
            print('Pretrained File :', _pretrained_file)
            saved_file_path = os.path.join(SAVE_DIR, 'checkpoints', _pretrained_file)
        elif pretrained_file is False:
            # Look the pretrained file up on disk by model signature and serial id.
            __fname = '*' + model_obj.model_signature + '*' + _match + '*.pb'
            try:
                saved_file_path = glob.glob(
                    os.path.join(SAVE_DIR, 'checkpoints', __fname))[0]
            except IndexError:
                saved_file_path = None

        if saved_file_path is not None:
            model_obj.set_pretrained_model_file(saved_file_path)
        else:
            model_obj.train_model(_x)
    elif _use_pretrained is False:
        model_obj.train_model(_x)

    _ep = entity_prob_test[idx]
    if CONFIG[_DIR]['w_mean']:
        mean_embeddings = model_obj.get_w_embedding_mean(_x, _ep)
    else:
        mean_embeddings = model_obj.get_embedding_mean(_x)

    _test_all_id = test_all_id[idx]
    _all_ids = list(train_ids)
    _all_ids.extend(list(_test_all_id))
    anomalies = test_anom_id[idx]
    print('Number of true anomalies', len(anomalies))

    # ---------------------
    # Use LOF on the (weighted) mean embeddings.
    # ---------------------
    sorted_id_score_dict = lof_1.anomaly_1(
        id_list=_all_ids,
        embed_list=mean_embeddings
    )
    _scored_dict_test = OrderedDict(sorted_id_score_dict)

    if eval_type == 1:
        recall, precision = evaluation_v1.precision_recall_curve(
            _scored_dict_test, anomaly_id_list=anomalies)
    elif eval_type == 2:
        recall, precision = evaluation_v2.precision_recall_curve(
            _scored_dict_test, anomaly_id_list=anomalies)

    # test_result_r.append(recall)
    # test_result_p.append(precison)
    cur_auc = auc(recall, precision)
    print('AUC ::', cur_auc)
    print('--------------------------')
    return cur_auc, recall, precision
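# A possible driver for process(): iterate over the cross-validation test sets, collect the
# per-set auPR values and report their mean, mirroring the aggregate reporting used by the
# other experiment scripts in this excerpt. The function name run_all_test_sets is
# illustrative; the arguments mirror those of process() above.
def run_all_test_sets(CONFIG, _DIR, data_x, test_x, train_ids,
                      test_all_id, test_anom_id, test_SerialID,
                      entity_prob_test, eval_type=1):
    all_auc = []
    for idx in range(len(test_x)):
        cur_auc, recall, precision = process(
            idx, CONFIG, _DIR, data_x, test_x, train_ids,
            test_all_id, test_anom_id, test_SerialID,
            entity_prob_test, eval_type
        )
        all_auc.append(cur_auc)
    print('Mean AUC', sum(all_auc) / len(all_auc))
    return all_auc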
def main(_dir=None):
    global DATA_DIR
    global _DIR
    global config
    global OP_DIR
    global DATA_X
    global DISCARD_0

    _dir = _args['_dir']
    k_val = _args['k_val']
    DISCARD_0 = _args['discard_0']
    setup(_dir)

    _DATA_X, test_anom_id, test_all_id, test_x = get_data()
    DATA_X = _DATA_X

    K = int(config['K'])
    # Override K if a value was passed on the command line.
    if k_val is not None:
        K = k_val
    print(k_val)

    N = DATA_X.shape[0]
    obj_ADTree = ad_tree_v1.ADT()
    obj_ADTree.setup(DATA_X)

    attribute_list = list(range(DATA_X.shape[1]))
    print('Attribute list', attribute_list)
    attribute_set_pairs = get_attribute_sets(attribute_list, obj_ADTree, k=K)
    print(attribute_set_pairs)
    print(' Number of attribute set pairs ', len(attribute_set_pairs))

    # Testing phase
    number_CV = len(test_all_id)
    for n in range(number_CV):
        start = time.time()
        test_data = test_x[n]
        id_list = test_all_id[n]
        anom_id_list = test_anom_id[n]

        result_dict = {}
        results = []
        for _id, record in zip(id_list, test_data):
            a = get_r_value(_id, record, obj_ADTree, attribute_set_pairs, N)
            results.append(a)
        for e in results:
            result_dict[e[0]] = e[1]

        end = time.time()
        print('-----------------------')
        print(_DIR)
        print('k = ', K)
        print(' Time taken :', end - start)

        # Save the per-record r values to file.
        SAVE_FILE_OP = '_'.join([
            'result_alg_1_test_' + str(n),
            _DIR,
            str(time.time()).split('.')[0]
        ]) + '.pkl'
        SAVE_FILE_OP_PATH = os.path.join(DATA_DIR, SAVE_FILE_OP)
        with open(SAVE_FILE_OP_PATH, 'wb') as fh:
            pickle.dump(result_dict, fh, pickle.HIGHEST_PROTOCOL)

        tmp = sorted(result_dict.items(), key=operator.itemgetter(1))
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]
        print('--------------------------')

        # Plot the distribution of r values.
        _y = list(sorted(list(result_dict.values())))
        _x = list(range(len(_y)))
        plt.figure(figsize=[14, 8])
        plt.plot(_x, _y, color='red', linewidth=1.5)
        plt.xlabel('Samples (sorted)', fontsize=15)
        plt.ylabel('Decision value r', fontsize=15)
        f_name = 'r_vals' + '_K_' + str(K) + '_test_' + str(n) + \
            '_discard_0_' + str(DISCARD_0) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        plt.savefig(f_path)
        plt.close()

        # ------------------------------- #
        print('--------------------------')
        recall, precision = evaluation_v1.precision_recall_curve(
            sorted_id_score_dict,
            anomaly_id_list=anom_id_list
        )
        _auc = auc(recall, precision)

        plt.figure(figsize=[14, 8])
        plt.plot(recall, precision, color='blue', linewidth=1.75)
        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Recall | AUC ' + str(_auc), fontsize=15)
        f_name = 'precison-recall_1' + '_K_' + str(K) + '_test_' + str(n) + \
            '_discard_0_' + str(DISCARD_0) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        plt.savefig(f_path)
        plt.close()
        print('----------------------------')

        x, y = evaluation_v1.performance_by_score(sorted_id_score_dict, anom_id_list)
        plt.figure(figsize=[14, 8])
        plt.plot(x, y, color='red', linewidth=1.75)
        # plt.xlabel(' ', fontsize=15)
        plt.ylabel('Percentage of anomalies detected', fontsize=15)
        plt.title('Lowest % of scores', fontsize=15)
        f_name = 'score_1_test_' + str(n) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        plt.savefig(f_path)
        plt.close()
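# get_r_value and the AD-Tree (ad_tree_v1.ADT) are project-local and not shown in this
# excerpt. The sketch below illustrates one common form of the decision value r for a
# single record: the ratio of the observed joint support of a pair of attribute sets to
# the support expected under independence, minimised over all pairs. This is an assumption
# about what get_r_value computes; count_fn stands in for the AD-Tree count queries and is
# purely illustrative.
def _r_value_sketch(record, attribute_set_pairs, count_fn, N):
    r_min = float('inf')
    for set_a, set_b in attribute_set_pairs:
        vals_a = {a: record[a] for a in set_a}
        vals_b = {b: record[b] for b in set_b}
        p_ab = count_fn({**vals_a, **vals_b}) / N   # joint empirical probability
        p_a = count_fn(vals_a) / N                  # marginal of attribute set A
        p_b = count_fn(vals_b) / N                  # marginal of attribute set B
        if p_a > 0 and p_b > 0:
            # A low ratio means the combination occurs far less often than expected.
            r_min = min(r_min, p_ab / (p_a * p_b))
    return r_min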
def main():
    global _TIME_IT
    global _DIR
    global OP_DIR
    global SAVE_DIR
    global config

    setup()
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    # Create the per-dataset sub-directory if it does not exist yet.
    if not os.path.exists(os.path.join(SAVE_DIR, _DIR)):
        os.mkdir(os.path.join(SAVE_DIR, _DIR))
    checkpoint_dir = os.path.join(SAVE_DIR)
    print(os.getcwd())

    # data_x, test_anom_id, test_all_id, test_x =
    data_x, data_x_id, test_anom_id, test_all_id, test_x = get_data()
    count_test_sets = min(len(test_x), 1)

    test_result_r = []
    test_result_p = []
    res = None
    time_arr = []
    auc_arr = []

    for i in range(count_test_sets):
        start_time = time.time()

        _x = test_x[i]
        _x = np.vstack([data_x, _x])
        test_ids = test_all_id[i]
        print(' >> ', len(test_ids))

        _x_id = list(data_x_id)
        _x_id.extend(test_ids)
        print(_x.shape)
        # _x = _x[:2000, :4]
        # _x_id = _x_id[:2000]
        print(_x.shape)
        print(len(_x_id))
        print(_x)

        # Known anomalies
        anomaly_ids = test_anom_id[i]

        # ---- Core ------ #
        # CompreX expects a categorical DataFrame; build one column per feature.
        _df_input = []
        for _j in range(_x.shape[0]):
            _df_input.append(list(_x[_j]))
        cols = ['f' + str(j) for j in range(_x.shape[1])]
        X = pd.DataFrame(
            _df_input,
            columns=cols,
            index=[_j for _j in range(_x.shape[0])],
            dtype='category'
        )

        estimator = CompreX(logging_level=logging.ERROR)
        estimator.transform(X)
        estimator.fit(X)
        res = estimator.predict(X)

        # 'res' is ordered in the order of the input;
        # match it with the ordered list of ids.
        anomaly_scores = list(res)
        anomaly_score_dict = {k: v for k, v in zip(_x_id, anomaly_scores)}

        # --------------- #
        # Sort in descending order, since a higher score means more anomalous.
        tmp = sorted(
            anomaly_score_dict.items(),
            key=operator.itemgetter(1),
            reverse=True
        )
        sorted_id_score_dict = OrderedDict()
        for e in tmp:
            sorted_id_score_dict[e[0]] = e[1]

        recall, precision = eval.precision_recall_curve(
            sorted_id_score_dict,
            anomaly_id_list=anomaly_ids
        )

        end_time = time.time()
        time_taken = end_time - start_time
        _auc = auc(recall, precision)
        print('Test case ', i, 'Time taken [seconds]', time_taken, 'AUC', _auc)
        print('--------------------------')
        time_arr.append(time_taken)
        auc_arr.append(_auc)

    print('=================')
    print('Avg AUC :', np.mean(auc_arr))
    print('Avg time', np.mean(time_arr))
    '''
    if _TIME_IT == False:
        _auc = auc(recall, precision)
        print('AUC', _auc)
        plt.figure(figsize=[14, 8])
        plt.plot(recall, precision, color='blue', linewidth=1.75)
        plt.xlabel('Recall', fontsize=15)
        plt.ylabel('Precision', fontsize=15)
        plt.title('Recall | AUC ' + str(_auc), fontsize=15)
        f_name = 'precison-recall_1_test_' + str(i) + '.png'
        f_path = os.path.join(OP_DIR, f_name)
        # plt.savefig(f_path)
        test_result_r.append(recall)
        test_result_p.append(precision)
        plt.close()
    '''
    print('----------------------------')
    '''