def train_and_save(args, data):
    """Train DevNet ``args.runs`` times and append one summary row to a CSV.

    Args:
        args: Namespace-like object. Reads ``runs``, ``mode``,
            ``known_outliers`` and ``output``; its whole ``__dict__`` is
            forwarded to ``DevNet`` and stored in the results row.
        data: Object exposing ``X_val`` and ``Y_val``; it is also passed to
            ``get_train`` to build the training set of every run.

    Raises:
        ValueError: If ``args.mode`` is not ``"unsupervised"``,
            ``"semi_supervised"`` or ``"supervised"``.
    """
    rauc = np.zeros(args.runs)
    ap = np.zeros(args.runs)

    # Number of truly labelled anomalies implied by the training mode.
    if args.mode == "unsupervised":
        real_ano = 0
    elif args.mode == "semi_supervised":
        real_ano = int(args.known_outliers / 2)
    elif args.mode == "supervised":
        real_ano = args.known_outliers
    else:
        # Previously this fell through and crashed later on an unbound name.
        raise ValueError("Unknown mode: %r" % (args.mode,))

    print("Mode :", args.mode, "Total outliers :", args.known_outliers,
          "Real outliers", real_ano)

    for i in range(args.runs):
        x_train, y_train = get_train(data, args)
        devnet = DevNet(**args.__dict__)
        devnet.fit(x_train, y_train)
        scores = devnet.decision_function(data.X_val)
        rauc[i], ap[i] = aucPerformance(scores, data.Y_val)

    # The columns are called mean_auc/std_auc but hold the statistics of
    # `ap` (the second value returned by aucPerformance); the names are kept
    # so rows stay compatible with CSV files that already exist.
    mean_aucpr = np.mean(ap)
    std_aucpr = np.std(ap)

    df = pd.DataFrame.from_dict(args.__dict__, orient="index").T
    df["real_anomaly"] = real_ano
    df["mean_auc"] = mean_aucpr
    df["std_auc"] = std_aucpr

    # Write the header only when the output file is missing or unreadable
    # as CSV; otherwise append the bare row.
    try:
        pd.read_csv(args.output)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        write_header = True
    else:
        write_header = False
    df.to_csv(args.output, mode="a", index=False, header=write_header)
def evaluating(self, x_test, y_test):
    """Score ``x_test`` with a copy of ``self.net`` and report AUC metrics.

    Args:
        x_test: Input passed directly to the copied network.
        y_test: Ground-truth tensor; converted to a NumPy array via
            ``detach().cpu().numpy()``.

    Returns:
        The ``(auc_roc, auc_pr, ap)`` triple produced by
        ``aucPerformance(y_test, y_pred)``.
    """
    # Work on a deep copy so the forward pass cannot touch self.net.
    model_copy = deepcopy(self.net)
    predictions = model_copy(x_test).detach().cpu().numpy()
    targets = y_test.detach().cpu().numpy()

    auc_roc, auc_pr, ap = aucPerformance(targets, predictions)
    print(auc_roc)

    del model_copy
    return auc_roc, auc_pr, ap
def train_and_save(args, data):
    """Train Deep SVDD ``args.runs`` times and append one summary row to a CSV.

    Args:
        args: Namespace-like object. Reads ``runs``, ``mode``, ``rep_dim``,
            ``weight_decay`` and ``output``; its whole ``__dict__`` is
            forwarded to ``DSVDD`` and stored in the results row.
        data: Object exposing ``X_train``, ``X_val`` and ``Y_val``.
    """
    rauc = np.zeros(args.runs)
    ap = np.zeros(args.runs)
    print("Mode :", args.mode, "Rep dim :", args.rep_dim,
          "L2 weight :", args.weight_decay)

    for i in range(args.runs):
        d_svdd = DSVDD(**args.__dict__)
        d_svdd.fit(data.X_train, verbose=False)
        scores = d_svdd.decision_function(data.X_val)
        rauc[i], ap[i] = aucPerformance(scores, data.Y_val)

    # The columns are called mean_auc/std_auc but hold the statistics of
    # `ap` (the second value returned by aucPerformance); the names are kept
    # so rows stay compatible with CSV files that already exist.
    mean_aucpr = np.mean(ap)
    std_aucpr = np.std(ap)

    df = pd.DataFrame.from_dict(args.__dict__, orient="index").T
    df["mean_auc"] = mean_aucpr
    df["std_auc"] = std_aucpr

    # Write the header only when the output file is missing or unreadable
    # as CSV. The handler is restricted to file/parse errors so that
    # KeyboardInterrupt and real bugs are no longer swallowed.
    try:
        pd.read_csv(args.output)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        write_header = True
    else:
        write_header = False
    df.to_csv(args.output, mode="a", index=False, header=write_header)
def run_t(args):
    """Train and evaluate the supervised network on the ``.npy`` dataset.

    For every dataset name the data is split 80/20 (stratified), surplus
    labelled outliers are removed from the training part so that at most
    ``args.known_outliers`` remain, a model is trained ``args.runs`` times,
    and the averaged scores are passed to ``writeResults``.

    Args:
        args: Namespace-like object. Reads ``network_depth``, ``ramdn_seed``
            (sic), ``runs``, ``input_path``, ``known_outliers``, ``epochs``,
            ``batch_size``, ``nb_batch`` and ``output``.
    """
    names = ['x_train_1w_50percent']
    network_depth = int(args.network_depth)
    random_seed = args.ramdn_seed
    for nm in names:
        runs = args.runs
        rauc = np.zeros(runs)
        ap = np.zeros(runs)
        filename = nm.strip()
        x, labels = dataLoading_np(args.input_path + filename + ".npy")
        outlier_indices = np.where(labels == 1)[0]
        outliers = x[outlier_indices]
        n_outliers_org = outliers.shape[0]

        train_time = 0
        test_time = 0
        for i in np.arange(runs):
            # The split seed is fixed, so every run sees the same partition.
            x_train, x_test, y_train, y_test = train_test_split(
                x, labels, test_size=0.2, random_state=42, stratify=labels)
            print('x_train', x_train.shape, type(x_train))
            print('y_train', y_train.shape, type(y_train))
            print('x_test', x_test.shape, type(x_test))
            print('y_test', y_test.shape, type(y_test))

            y_train = np.array(y_train)
            y_test = np.array(y_test)
            print(filename + ': round ' + str(i))
            outlier_indices = np.where(y_train == 1)[0]
            inlier_indices = np.where(y_train == 0)[0]
            n_outliers = len(outlier_indices)
            print("Original training size: %d, No. outliers: %d" %
                  (x_train.shape[0], n_outliers))

            rng = np.random.RandomState(random_seed)
            if n_outliers > args.known_outliers:
                # Keep only `known_outliers` labelled anomalies for training.
                mn = n_outliers - args.known_outliers
                remove_idx = rng.choice(outlier_indices, mn, replace=False)
                x_train = np.delete(x_train, remove_idx, axis=0)
                y_train = np.delete(y_train, remove_idx, axis=0)

            outlier_indices = np.where(y_train == 1)[0]
            inlier_indices = np.where(y_train == 0)[0]
            print('training samples num:', y_train.shape[0], 'outlier num:',
                  outlier_indices.shape[0], 'inlier num:',
                  inlier_indices.shape[0])
            n_samples_trn = x_train.shape[0]
            n_outliers = len(outlier_indices)
            print("Training data size: %d, No. outliers: %d" %
                  (x_train.shape[0], n_outliers))

            start_time = time.time()
            input_shape = x_train.shape[1:]
            epochs = args.epochs
            batch_size = args.batch_size
            nb_batch = args.nb_batch
            model = neural_network(input_shape, network_depth)
            print(model.summary())
            model_filename = filename + "_" + str(
                args.batch_size) + "bs_" + str(
                    args.known_outliers) + "ko_" + str(network_depth) + "d.h5"
            model_name = os.path.join('../model/train_', model_filename)
            checkpointer = ModelCheckpoint(model_name,
                                           monitor='loss',
                                           verbose=0,
                                           save_best_only=True,
                                           save_weights_only=True)
            model.fit_generator(batch_generator_sup(x_train, outlier_indices,
                                                    inlier_indices,
                                                    batch_size, nb_batch, rng),
                                steps_per_epoch=nb_batch,
                                epochs=epochs,
                                callbacks=[checkpointer])
            train_time += time.time() - start_time

            # Scoring reloads the best checkpoint rather than using `model`.
            start_time = time.time()
            scores = load_model_weight_predict(model_name, input_shape,
                                               network_depth, x_test)
            test_time += time.time() - start_time
            print(scores.shape)
            rauc[i], ap[i] = aucPerformance(scores, y_test)

            # Hard decisions at a 0.5 threshold for precision/recall/F1.
            predic_class = np.where(scores > 0.5, 1, 0)
            precision_new = precision_score(y_test, predic_class)
            print('new precision', precision_new)
            recall_new = recall_score(y_test, predic_class)
            print('new recall', recall_new)
            # Precision and recall are both 0 when no positive is predicted
            # correctly; report F1 as 0 instead of dividing by zero.
            pr_sum = precision_new + recall_new
            if pr_sum > 0:
                f1_new = 2 * ((precision_new * recall_new) / pr_sum)
            else:
                f1_new = 0.0
            print('f1 new', f1_new)

            fig3 = plt.figure()
            plt.plot(model.history.history['loss'])
            plt.title('model loss')
            plt.ylabel('loss')
            plt.xlabel('epoch')
            plt.legend(['train', 'validation'], loc='upper left')
            plt.show()
            fig3.savefig('my_figure3.png')

        mean_auc = np.mean(rauc)
        std_auc = np.std(rauc)
        mean_aucpr = np.mean(ap)
        std_aucpr = np.std(ap)
        train_time = train_time / runs
        test_time = test_time / runs
        print("average AUC-ROC: %.4f, average AUC-PR: %.4f" %
              (mean_auc, mean_aucpr))
        print("average runtime: %.4f seconds" % (train_time + test_time))

        # Free-text descriptors and placeholder zeros stored with the results.
        architecture = '2 hidlstm(64)+1dense(32)+ 1dense(1)'
        losses = 'binary-cross-entropy'
        max_precision = 0
        max_recall = 0
        f_one = 0
        writeResults(filename + '_' + str(network_depth),
                     losses,
                     x.shape[0],
                     x.shape[1],
                     n_samples_trn,
                     n_outliers_org,
                     n_outliers,
                     network_depth,
                     mean_auc,
                     mean_aucpr,
                     std_auc,
                     std_aucpr,
                     train_time,
                     test_time,
                     architecture,
                     epochs,
                     batch_size,
                     nb_batch,
                     precision_new,
                     recall_new,
                     f1_new,
                     max_precision,
                     max_recall,
                     f_one,
                     path=args.output)
import warnings

# Stand-alone evaluation of a saved checkpoint on a held-out .npy test set.
# All names below stay module-level globals, exactly as before.
model_path = '/content/gdrive/My Drive/devnet/DevNet/DevNet-master copy 2/model/devnet_/x_train_1w_50percent_512bs_943ko_1d.h5'
network_depth = 1
input_shape = [1, 298]
x_test = np.load(
    '/content/gdrive/My Drive/devnet/DevNet/DevNet-master copy 2/dataset/x_test_1w_50percent.npy'
)
y_test = np.load(
    '/content/gdrive/My Drive/devnet/DevNet/DevNet-master copy 2/dataset/y_test_1w_50percent.npy'
)
scores = load_model_weight_predict(model_path,
                                   input_shape=input_shape,
                                   network_depth=network_depth,
                                   x_test=x_test)

# Hard decisions at a 0.5 threshold for precision/recall/F1.
preds = scores
class_one = preds > 0.5
predic_class = np.where(class_one, 1, 0)
precision_new = precision_score(y_test, predic_class)
print('precision', precision_new)
recall_new = recall_score(y_test, predic_class)
print('recall', recall_new)
# Precision and recall are both 0 when no positive is predicted correctly;
# report F1 as 0 instead of dividing by zero.
if precision_new + recall_new > 0:
    f1_new = 2 * ((precision_new * recall_new) / (precision_new + recall_new))
else:
    f1_new = 0.0
print('f1', f1_new)
AUC_ROC, AUC_PR = aucPerformance(scores, y_test)
def run_devnet(args):
    """Train and evaluate a deviation network on each configured dataset.

    For every dataset name the data is loaded (CSV when
    ``args.data_format == 0``, svmlight otherwise), split 80/20 with
    stratification, reduced to at most ``args.known_outliers`` labelled
    outliers, contaminated with injected noise labelled as inliers, and then
    trained/scored ``args.runs`` times. Averaged scores go to
    ``writeResults``.

    Args:
        args: Namespace-like object. Reads ``data_set``, ``network_depth``,
            ``ramdn_seed`` (sic), ``runs``, ``data_format``, ``input_path``,
            ``cont_rate``, ``known_outliers``, ``epochs``, ``batch_size``,
            ``nb_batch`` and ``output``.

    Side effects:
        Sets the module-level global ``data_format``.
    """
    names = args.data_set.split(',')
    # NOTE(review): the next line overrides whatever was passed in
    # args.data_set — looks like a leftover hard-coded dataset; confirm.
    names = ['creditcard_train']
    network_depth = int(args.network_depth)
    random_seed = args.ramdn_seed
    for nm in names:
        runs = args.runs
        rauc = np.zeros(runs)
        ap = np.zeros(runs)
        filename = nm.strip()
        global data_format
        data_format = int(args.data_format)
        if data_format == 0:
            x, labels = dataLoading(args.input_path + filename + ".csv")
        else:
            x, labels = get_data_from_svmlight_file(args.input_path +
                                                    filename + ".svm")
            x = x.tocsr()
        # Outliers are taken from the full dataset (before the split) and are
        # later used as the source for noise injection.
        outlier_indices = np.where(labels == 1)[0]
        outliers = x[outlier_indices]
        n_outliers_org = outliers.shape[0]

        train_time = 0
        test_time = 0
        for i in np.arange(runs):
            # random_state is fixed, so every run uses the same partition.
            x_train, x_test, y_train, y_test = train_test_split(
                x, labels, test_size=0.2, random_state=42, stratify=labels)
            y_train = np.array(y_train)
            y_test = np.array(y_test)
            print(filename + ': round ' + str(i))
            outlier_indices = np.where(y_train == 1)[0]
            inlier_indices = np.where(y_train == 0)[0]
            n_outliers = len(outlier_indices)
            print("Original training size: %d, No. outliers: %d" %
                  (x_train.shape[0], n_outliers))

            # Number of noise rows so that they make up a `cont_rate`
            # fraction of (inliers + noise).
            n_noise = len(np.where(
                y_train == 0)[0]) * args.cont_rate / (1. - args.cont_rate)
            n_noise = int(n_noise)

            # Re-seeded on every run; the same generator is later handed to
            # the batch generator, so call order below matters.
            rng = np.random.RandomState(random_seed)
            if data_format == 0:
                if n_outliers > args.known_outliers:
                    # Drop surplus labelled outliers from the training set.
                    mn = n_outliers - args.known_outliers
                    remove_idx = rng.choice(outlier_indices, mn, replace=False)
                    x_train = np.delete(x_train, remove_idx, axis=0)
                    y_train = np.delete(y_train, remove_idx, axis=0)

                # Injected noise rows are appended with label 0 (inlier).
                noises = inject_noise(outliers, n_noise, random_seed)
                x_train = np.append(x_train, noises, axis=0)
                y_train = np.append(y_train, np.zeros((noises.shape[0], 1)))

            else:
                if n_outliers > args.known_outliers:
                    mn = n_outliers - args.known_outliers
                    remove_idx = rng.choice(outlier_indices, mn, replace=False)
                    # Sparse path: select the rows to keep instead of
                    # deleting. x and y are indexed with the same list, so
                    # they stay aligned.
                    retain_idx = set(np.arange(
                        x_train.shape[0])) - set(remove_idx)
                    retain_idx = list(retain_idx)
                    x_train = x_train[retain_idx]
                    y_train = y_train[retain_idx]

                noises = inject_noise_sparse(outliers, n_noise, random_seed)
                x_train = vstack([x_train, noises])
                y_train = np.append(y_train, np.zeros((noises.shape[0], 1)))

            # Recompute the index sets after removal and noise injection.
            outlier_indices = np.where(y_train == 1)[0]
            inlier_indices = np.where(y_train == 0)[0]
            print('training samples num:', y_train.shape[0], 'outlier num:',
                  outlier_indices.shape[0], 'inlier num:',
                  inlier_indices.shape[0], 'noise num:', n_noise)
            n_samples_trn = x_train.shape[0]
            n_outliers = len(outlier_indices)
            print("Training data size: %d, No. outliers: %d" %
                  (x_train.shape[0], n_outliers))

            start_time = time.time()
            input_shape = x_train.shape[1:]
            epochs = args.epochs
            batch_size = args.batch_size
            nb_batch = args.nb_batch
            model = deviation_network(input_shape, network_depth)
            print(model.summary())
            model_name = "./model/devnet_" + filename + "_" + str(
                args.cont_rate) + "cr_" + str(args.batch_size) + "bs_" + str(
                    args.known_outliers) + "ko_" + str(network_depth) + "d.h5"
            # Only the weights with the lowest training loss are kept on disk.
            checkpointer = ModelCheckpoint(model_name,
                                           monitor='loss',
                                           verbose=0,
                                           save_best_only=True,
                                           save_weights_only=True)

            model.fit_generator(batch_generator_sup(x_train, outlier_indices,
                                                    inlier_indices,
                                                    batch_size, nb_batch, rng),
                                steps_per_epoch=nb_batch,
                                epochs=epochs,
                                callbacks=[checkpointer])
            train_time += time.time() - start_time

            # Scoring goes through the saved checkpoint, not `model`.
            start_time = time.time()
            scores = load_model_weight_predict(model_name, input_shape,
                                               network_depth, x_test)
            test_time += time.time() - start_time
            rauc[i], ap[i] = aucPerformance(scores, y_test)

        mean_auc = np.mean(rauc)
        std_auc = np.std(rauc)
        mean_aucpr = np.mean(ap)
        std_aucpr = np.std(ap)
        # Convert accumulated times to per-run averages.
        train_time = train_time / runs
        test_time = test_time / runs
        print("average AUC-ROC: %.4f, average AUC-PR: %.4f" %
              (mean_auc, mean_aucpr))
        print("average runtime: %.4f seconds" % (train_time + test_time))
        writeResults(filename + '_' + str(network_depth),
                     x.shape[0],
                     x.shape[1],
                     n_samples_trn,
                     n_outliers_org,
                     n_outliers,
                     network_depth,
                     mean_auc,
                     mean_aucpr,
                     std_auc,
                     std_aucpr,
                     train_time,
                     test_time,
                     path=args.output)
def main():
    """Run the MAML and G-dev experiments on 'yelp' and log AUC scores.

    Seeds torch and NumPy with 666, then for ``nb_try`` task samplings and
    ``nb_runs`` runs each: trains a ``Meta`` model and evaluates it, trains
    an ``SGC`` model with ``deviation_loss`` and evaluates it, and writes
    both results to a timestamped text file under ``results/``.

    Reads the module-level ``args`` (``task_num``, ``update_step``,
    ``n_way``).
    """
    torch.manual_seed(666)
    torch.cuda.manual_seed_all(666)
    np.random.seed(666)
    # data_name = 'BlogCatalog'
    print(args)

    # Experiment constants.
    nb_epochs = 50
    nb_runs = 16
    nb_try = 16
    nb_batch_maml = 10
    nb_batch = 32
    lr_1 = 0.03  # written to the log as "labeled ratio"; passed as l_ratio
    lr_s = lr_1 * args.task_num  # l_ratio used for the G-dev phase
    tr = 0.6  # written to the log as "training ratio"
    # features, labels, idx_train, y_train, idx_val, y_val, idx_labeled, train_unlabeled = SGC_process(data_name, degree=2, l_ratio=0.08, tr_ratio=0.8)
    # print(args.task_num)
    aucfile = 'results/auc_' + datetime.now().strftime("%m_%d_%H_%M") + '_yelp.txt'
    with open(aucfile, 'a') as f:
        f.write("settings: {labeled ratio: %f, training ratio: %f, epochs: %d, update_step: %d}\n" % (lr_1, tr, nb_epochs, args.update_step))
        for t in range(nb_try):
            taskData = task(nb_task=args.task_num, degree=2, l_ratio=lr_1, t_ratio=tr, name='yelp')
            taskData.loadNProcess()
            f.write("target data name:" + taskData.f_name[-1] + "\n")
            f.write("%d-th try: \n" % t)
            for i in range(nb_runs):
                # training maml
                print("maml training...")
                print("In %d-th run..." % (i + 1))
                f.write("%d-th run\n" % i)
                feature_list, label, l_list, ul_list, idx_test = taskData.sampleAnomaly()
                config = modelArch(feature_list[0].shape[1], args.n_way)

                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                maml = Meta(args, config).to(device)

                # stats of parameters to be updated: total element count over
                # all parameters that require gradients
                tmp = filter(lambda x: x.requires_grad, maml.parameters())
                num = sum(map(lambda x: np.prod(x.shape), tmp))
                # print(maml)
                print("Total #trainable tensors: ", num)
                batch_gen = DataLoaderN(feature_list, l_list, ul_list, b_size=8, b_size_qry=6, nb_task=args.task_num, device=device)
                maml.train()
                for e in range(1, nb_epochs + 1):
                    print("Running %d-th epoch" % e)
                    epoch_loss = 0
                    epoch_acc = 0  # never read in this function
                    for b in range(nb_batch_maml):
                        x_train, y_train, x_qry, y_qry = batch_gen.getBatch(qry=False)
                        # No optimizer step is taken here; presumably Meta
                        # updates its parameters inside the call — TODO confirm.
                        y_pred, loss = maml(x_train, y_train, x_qry, y_qry)
                        epoch_loss += loss
                    print("Epoch loss: %f" % epoch_loss)
                print("End of training.")

                # testing
                print("Evaluating the maml model")
                maml.eval()
                # The last task's features are used as the evaluation target.
                x_test, y_test = feature_list[args.task_num-1][idx_test].to(device), label[idx_test].to(device)
                auc_roc, auc_pr, ap = maml.evaluating(x_test, y_test)
                print("End of evaluating.")
                f.write("MAML auc_roc: %.5f, auc_pr: %.5f, ap: %.5f\n" % (auc_roc, auc_pr, ap))

                # g-dev training
                print('G-dev training...')
                # Note: idx_test is rebound here to the split returned by
                # SGC_process.
                features, labels, idx_labeled, idx_unlabeled, idx_test = SGC_process(taskData.target, degree=2, l_ratio=lr_s, tr_ratio=tr)
                # print("finish loading data...")
                attr_dim = features.shape[1]

                # print("%d-th run:" % i)
                # model = FCNet(attr_dim, 1).to(device)
                # model = SGC_original(attr_dim, 1).to(device)
                model = SGC(attr_dim, 1).to(device)
                # print(model)
                optim = torch.optim.Adam(model.parameters(), lr=0.002, weight_decay=0)
                # loss = deviation_loss()
                data_sampler = DataLoader(features, idx_labeled, idx_unlabeled, b_size=8)
                model.float()
                model.train()
                for e in range(1, nb_epochs + 1):
                    # print('Epoch: %d' % e)
                    epoch_loss = 0
                    epoch_acc = 0  # never read in this function
                    for b in range(nb_batch):
                        x_b, y_b = data_sampler.getBatch()
                        x_b, y_b = x_b.to(device), y_b.to(device)
                        y_pred = model(x_b)
                        loss = deviation_loss(y_b, y_pred)
                        optim.zero_grad()
                        loss.backward()
                        optim.step()
                        epoch_loss += loss.item()
                    print("epoch loss %f" % epoch_loss)

                # validation
                model.eval()
                # print(idx_val.shape)
                x_val = features[idx_test].to(device)
                # print(x_val.shape)
                y_pred = model(x_val).detach().cpu().numpy()
                y_val = labels[idx_test].detach().cpu().numpy()
                # fpr, tpr, roc_auc = dict(), dict(), dict()
                # for i in range(2):
                #     fpr[i], tpr[i], _ = roc_curve(y_val, y_pred, pos_label=1)
                #     roc_auc[i] = auc(fpr[i], tpr[i])
                # NOTE(review): here the THIRD return value is bound to
                # `auc_pr`, whereas the MAML line above logs the second value
                # as auc_pr and the third as ap — confirm which is intended.
                auc_roc, _, auc_pr = aucPerformance(y_val, y_pred)
                print("G-dev auc_roc: %.5f, auc_pr: %.5f" % (auc_roc, auc_pr))
                f.write("G-Dev auc_roc: %.5f, auc_pr: %.5f\n" % (auc_roc, auc_pr))
    # NOTE(review): redundant — the `with` block has already closed the file.
    f.close()
def run_devnet(args):
    """Train and evaluate a deviation network on the 'fraud' dataset.

    In ``"unsupervised"`` mode the ``args.known_outliers`` training samples
    with the highest ``lesinn`` scores are pseudo-labelled as outliers. The
    training set is then reduced to at most ``args.known_outliers`` labelled
    outliers, optionally contaminated with injected noise, and a model is
    trained/scored ``args.runs`` times. Averaged scores go to
    ``writeResults``.

    Args:
        args: Namespace-like object. Reads ``mode``, ``network_depth``,
            ``ramdn_seed`` (sic), ``runs``, ``data_format``,
            ``known_outliers``, ``cont_rate``, ``epochs``, ``batch_size``,
            ``nb_batch`` and ``output``.

    Side effects:
        Sets the module-level global ``data_format``.
    """
    print("Chosen mode :", args.mode)
    nm = 'fraud'
    network_depth = int(args.network_depth)
    random_seed = args.ramdn_seed
    runs = args.runs
    rauc = np.zeros(runs)
    ap = np.zeros(runs)
    filename = nm.strip()
    global data_format
    data_format = int(args.data_format)

    data = Dataset(mode="other")

    if args.mode == "unsupervised":
        outlier_scores = lesinn(data.X_train, data.X_train)
        ind_scores = np.argsort(outlier_scores.flatten())
        # Split by an explicit index: the former `[:-k]` / `[-k:]` slicing
        # turned every sample into a pseudo-outlier when k == 0.
        n_pseudo = min(max(int(args.known_outliers), 0), ind_scores.shape[0])
        split = ind_scores.shape[0] - n_pseudo
        inlier_ids, outlier_ids = ind_scores[:split], ind_scores[split:]
        inlier_ids = np.intersect1d(inlier_ids, np.where(data.Y_train == 0)[0])

    train_time = 0
    test_time = 0
    for i in np.arange(runs):
        print(filename + ': round ' + str(i))
        x_train, x_test, y_train, y_test = data.X_train, data.X_val, data.Y_train, data.Y_val
        if args.mode == "unsupervised":
            # Pseudo-label a private copy so data.Y_train is not overwritten.
            y_train = y_train.copy()
            y_train[inlier_ids] = 0
            y_train[outlier_ids] = 1

        outlier_indices = np.where(y_train == 1)[0]
        outliers = x_train[outlier_indices]
        n_outliers_org = outliers.shape[0]

        inlier_indices = np.where(y_train == 0)[0]
        n_outliers = len(outlier_indices)

        # Number of noise rows so that they make up a `cont_rate` fraction
        # of (inliers + noise).
        n_noise = len(np.where(y_train == 0)[0]) * args.cont_rate / (1. - args.cont_rate)
        n_noise = int(n_noise)

        # Re-seeded on every run; also drives the batch generator below.
        rng = np.random.RandomState(random_seed)
        if data_format == 0:
            if n_outliers > args.known_outliers:
                # Drop surplus labelled outliers from the training set.
                mn = n_outliers - args.known_outliers
                remove_idx = rng.choice(outlier_indices, mn, replace=False)
                x_train = np.delete(x_train, remove_idx, axis=0)
                y_train = np.delete(y_train, remove_idx, axis=0)
            if args.cont_rate > 0:
                # Injected noise rows are appended with label 0 (inlier).
                noises = inject_noise(outliers, n_noise, random_seed)
                x_train = np.append(x_train, noises, axis=0)
                y_train = np.append(y_train, np.zeros((noises.shape[0], 1)))

        # Recompute the index sets after removal and noise injection.
        outlier_indices = np.where(y_train == 1)[0]
        inlier_indices = np.where(y_train == 0)[0]
        n_samples_trn = x_train.shape[0]
        n_outliers = len(outlier_indices)
        print("Training data size: %d, No. outliers: %d" % (x_train.shape[0], n_outliers))

        start_time = time.time()
        input_shape = x_train.shape[1:]
        epochs = args.epochs
        batch_size = args.batch_size
        nb_batch = args.nb_batch
        model = deviation_network(input_shape)
        model_name = "./model/" + args.mode + "_" + str(args.cont_rate) + "cr_" + str(args.known_outliers) + "d.h5"
        # Only the weights with the lowest training loss are kept on disk.
        checkpointer = ModelCheckpoint(model_name, monitor='loss', verbose=0,
                                       save_best_only=True, save_weights_only=True)

        model.fit_generator(batch_generator_sup(x_train, outlier_indices, inlier_indices, batch_size, nb_batch, rng),
                            steps_per_epoch=nb_batch,
                            epochs=epochs,
                            callbacks=[checkpointer],
                            verbose=True)
        train_time += time.time() - start_time

        # Scoring goes through the saved checkpoint, not `model`.
        start_time = time.time()
        scores = load_model_weight_predict(model_name, input_shape, network_depth, x_test)
        test_time += time.time() - start_time
        rauc[i], ap[i] = aucPerformance(scores, y_test)

    mean_auc = np.mean(rauc)
    mean_aucpr = np.mean(ap)
    std_aucpr = np.std(ap)
    # Convert accumulated times to per-run averages.
    train_time = train_time / runs
    test_time = test_time / runs
    print("average AUC-ROC: %.4f, average AUC-PR: %.4f" % (mean_auc, mean_aucpr))
    writeResults(filename + '_vrai_' + str(network_depth), n_samples_trn, n_outliers_org, n_outliers,
                 mean_aucpr, std_aucpr, args.cont_rate, path=args.output)