def __init__(self): self.train_reader = Reader.Reader("dataset/data/segmented_set1/*.tfr") self.test_reader = Reader.Reader("dataset/data/segmented_set2/*.tfr") self.logs_dir = FLAGS.logdir self.train_logs_path = self.logs_dir + '/train_logs' self.test_logs_path = self.logs_dir + '/test_logs' self.chkpt_file = self.logs_dir + "/model.ckpt" self.ten_accuracy = [] self.epoch_accuracy = [] if FLAGS.test is True: self.is_training = False self._evaluate_test() else: self.is_training = True self._evaluate_train()
def __init__(self): self.train_reader = Reader.Reader("train_tf/*.tfrecord") self.logs_dir = FLAGS.logdir self.train_logs_path = self.logs_dir + '/train_logs' self.GRU_SIZE = 1620 self.cell_size = 9 self.max_batch_size = 60 self.record_N = 256 #self.chkpt_file = self.logs_dir + "/model.ckpt-54000" self.h_state_init_1 = np.zeros((1, self.GRU_SIZE), np.float32) self.h_state_init_2 = np.zeros((self.max_batch_size, self.GRU_SIZE), np.float32) #self.cell_state_init = np.zeros(4096).reshape(1,4096).astype(np.float32) # self.prev_asscoia = np.zeros((self.max_batch_size, self.record_N * (self.cell_size * self.cell_size+1)), np.float32) self.is_training = True self._evaluate_train()
def OurModel(ratio, kkkk, neighbors):
    params.lam1 = 0.0
    params.lam2 = 1.0

    # Dataset parameters
    data_path = params.data_dir_FB
    params.num_neighbor = neighbors
    params.batch_size = 1024
    params.anomaly_ratio = ratio
    model_name = "OurLSTM"
    data_name = "FB15K_FULL_"

    dataset = Reader(data_path, isInjectTopK=False)
    all_triples = dataset.train_data
    labels = dataset.labels
    train_idx = list(range(len(all_triples) // 2))
    num_iterations = math.ceil(dataset.num_triples_with_anomalies / params.batch_size)
    total_num_anomalies = dataset.num_anomalies

    logging.basicConfig(level=logging.INFO)
    file_handler = logging.FileHandler(
        os.path.join(params.log_folder,
                     model_name + "_" + data_name + "_" + str(ratio) + "_"
                     + str(kkkk) + "_Neighbors" + str(neighbors) + "_log.txt"))
    logger = logging.getLogger()
    logger.addHandler(file_handler)
    logging.info('There are %d Triples with %d anomalies in the graph.'
                 % (len(dataset.labels), total_num_anomalies))

    params.total_ent = dataset.num_entity
    params.total_rel = dataset.num_relation

    model_saved_path = model_name + "_" + data_name + "_" + str(ratio) + ".ckpt"
    model_saved_path = os.path.join(params.out_folder, model_saved_path)
    # model.load_state_dict(torch.load(model_saved_path))

    # Model: BiLSTM with attention over a triple and its neighborhood.
    model = BiLSTM_Attention(params.input_size_lstm, params.hidden_size_lstm,
                             params.num_layers_lstm, params.dropout,
                             params.alpha).to(params.device)
    criterion = nn.MarginRankingLoss(params.gama)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.learning_rate)

    for k in range(kkkk):
        for it in range(num_iterations):
            batch_h, batch_r, batch_t, batch_size = get_pair_batch_train_common(
                dataset, it, train_idx, params.batch_size, params.num_neighbor)
            batch_h = torch.LongTensor(batch_h).to(params.device)
            batch_t = torch.LongTensor(batch_t).to(params.device)
            batch_r = torch.LongTensor(batch_r).to(params.device)

            out, out_att = model(batch_h, batch_r, batch_t)
            out = out.reshape(batch_size, -1, 2 * 3 * params.BiLSTM_hidden_size)
            out_att = out_att.reshape(batch_size, -1, 2 * 3 * params.BiLSTM_hidden_size)

            pos_h = out[:, 0, :]
            pos_z0 = out_att[:, 0, :]
            pos_z1 = out_att[:, 1, :]
            neg_h = out[:, 1, :]
            neg_z0 = out_att[:, 2, :]
            neg_z1 = out_att[:, 3, :]

            # Loss: lam1-weighted view-consistency term plus lam2-weighted
            # translational term ||h + r - t|| over the three 2*hidden-wide
            # slices of the concatenated output.
            pos_loss = params.lam1 * torch.norm(pos_z0 - pos_z1, p=2, dim=1) + \
                params.lam2 * torch.norm(
                    pos_h[:, 0:2 * params.BiLSTM_hidden_size]
                    + pos_h[:, 2 * params.BiLSTM_hidden_size:2 * 2 * params.BiLSTM_hidden_size]
                    - pos_h[:, 2 * 2 * params.BiLSTM_hidden_size:2 * 3 * params.BiLSTM_hidden_size],
                    p=2, dim=1)
            neg_loss = params.lam1 * torch.norm(neg_z0 - neg_z1, p=2, dim=1) + \
                params.lam2 * torch.norm(
                    neg_h[:, 0:2 * params.BiLSTM_hidden_size]
                    + neg_h[:, 2 * params.BiLSTM_hidden_size:2 * 2 * params.BiLSTM_hidden_size]
                    - neg_h[:, 2 * 2 * params.BiLSTM_hidden_size:2 * 3 * params.BiLSTM_hidden_size],
                    p=2, dim=1)

            # With y = -1, MarginRankingLoss drives pos_loss below neg_loss
            # by at least the margin gama.
            y = -torch.ones(batch_size).to(params.device)
            loss = criterion(pos_loss, neg_loss, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            pos_loss_value = torch.sum(pos_loss) / (batch_size * 2.0)
            neg_loss_value = torch.sum(neg_loss) / (batch_size * 2.0)
            logging.info('There are %d Triples in this batch.' % batch_size)
            logging.info('Epoch: %d-%d, pos_loss: %f, neg_loss: %f, Loss: %f'
                         % (k, it + 1, pos_loss_value.item(),
                            neg_loss_value.item(), loss.item()))

    torch.save(model.state_dict(), model_saved_path)

    # Evaluation: reload the trained weights and score every triple.
    model1 = BiLSTM_Attention(params.input_size_lstm, params.hidden_size_lstm,
                              params.num_layers_lstm, params.dropout,
                              params.alpha).to(params.device)
    model1.load_state_dict(torch.load(model_saved_path))
    model1.eval()
    with torch.no_grad():
        all_loss = []
        all_label = []
        all_pred = []
        start_id = 0
        for i in range(num_iterations):
            batch_h, batch_r, batch_t, labels, start_id, batch_size = get_pair_batch_test(
                dataset, params.batch_size, params.num_neighbor, start_id)
            batch_h = torch.LongTensor(batch_h).to(params.device)
            batch_t = torch.LongTensor(batch_t).to(params.device)
            batch_r = torch.LongTensor(batch_r).to(params.device)
            labels = labels.to(params.device)

            out, out_att = model1(batch_h, batch_r, batch_t)
            out_att = out_att.reshape(batch_size, 2, 2 * 3 * params.BiLSTM_hidden_size)
            out_att_view0 = out_att[:, 0, :]
            out_att_view1 = out_att[:, 1, :]  # [B, 600] [B, 600]

            loss = params.lam1 * torch.norm(out_att_view0 - out_att_view1, p=2, dim=1) + \
                params.lam2 * torch.norm(
                    out[:, 0:2 * params.BiLSTM_hidden_size]
                    + out[:, 2 * params.BiLSTM_hidden_size:2 * 2 * params.BiLSTM_hidden_size]
                    - out[:, 2 * 2 * params.BiLSTM_hidden_size:2 * 3 * params.BiLSTM_hidden_size],
                    p=2, dim=1)

            all_loss += loss
            all_label += labels
            logging.info('[Train] Evaluation on %d batch of Original graph' % i)

        total_num = len(all_label)

        # Rank all triples by loss and count injected anomalies among the top-K.
        max_top_k = total_num_anomalies * 2
        min_top_k = total_num_anomalies // 10
        top_loss, top_indices = torch.topk(toarray_float(all_loss), max_top_k,
                                           largest=True, sorted=True)
        top_labels = toarray(
            [all_label[top_indices[iii]] for iii in range(len(top_indices))])

        # anomaly_discovered[i] = number of true anomalies within the top (i + 1).
        anomaly_discovered = []
        for i in range(max_top_k):
            if i == 0:
                anomaly_discovered.append(top_labels[i])
            else:
                anomaly_discovered.append(anomaly_discovered[i - 1] + top_labels[i])
        results_interval_10 = np.array(
            [anomaly_discovered[i * 10] for i in range(max_top_k // 10)])
        logging.info('[Train] final results: %s' % str(results_interval_10))

        top_k = np.arange(1, max_top_k + 1)
        assert len(top_k) == len(anomaly_discovered), 'The size of result list is wrong'
        precision_k = np.array(anomaly_discovered) / top_k
        recall_k = np.array(anomaly_discovered) * 1.0 / total_num_anomalies
        precision_interval_10 = [precision_k[i * 10] for i in range(max_top_k // 10)]
        logging.info('[Train] final Precision: %s' % str(precision_interval_10))
        recall_interval_10 = [recall_k[i * 10] for i in range(max_top_k // 10)]
        logging.info('[Train] final Recall: %s' % str(recall_interval_10))
        logging.info('parameter lambda :%s' % str(params.lam2))
        logging.info('K = %d' % kkkk)

        ratios = [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009,
                  0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1,
                  0.11, 0.12, 0.13, 0.14, 0.15]
        for i in range(len(ratios)):
            num_k = int(ratios[i] * dataset.num_original_triples)
            if num_k > len(anomaly_discovered):
                break
            recall = anomaly_discovered[num_k - 1] * 1.0 / total_num_anomalies
            precision = anomaly_discovered[num_k - 1] * 1.0 / num_k
            logging.info('[Train][%s][%s] Precision %f -- %f : %f'
                         % (data_name, model_name, ratio, ratios[i], precision))
            logging.info('[Train][%s][%s] Recall %f -- %f : %f'
                         % (data_name, model_name, ratio, ratios[i], recall))
            logging.info('[Train][%s][%s] anomalies in total: %d -- discovered: %d -- K : %d'
                         % (data_name, model_name, total_num_anomalies,
                            anomaly_discovered[num_k - 1], num_k))
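# --- Hypothetical helpers (hedged sketch) ------------------------------------
# `toarray` and `toarray_float` are used above but defined elsewhere in the
# repo. A minimal version consistent with their usage here (torch.topk and
# roc_auc_score both accept the result) would stack the per-triple 0-dim
# tensors collected in `all_loss` / `all_label` into one 1-D CPU tensor:
def toarray(tensor_list):
    # Labels collected as 0-dim tensors -> one 1-D long tensor on CPU.
    return torch.stack([t.cpu() for t in tensor_list]).long()


def toarray_float(tensor_list):
    # Losses collected as 0-dim tensors -> one 1-D float tensor on CPU.
    return torch.stack([t.cpu() for t in tensor_list]).float()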
def Triplenet(ratio, kkkk, data):
    # Dataset parameters
    params.batch_size = 2048
    params.lam2 = 1.0
    params.lam1 = 0
    data_path = data
    params.num_neighbor = 5
    params.anomaly_ratio = ratio
    model_name = "TripleNet_local_2048"
    if data_path == params.data_dir_DBPEDIA:
        data_name = "DBpedia"
        params.batch_size = 3600
    else:
        data_name = "NELL"

    dataset = Reader(data_path, isInjectTopK=False)
    all_triples = dataset.train_data
    labels = dataset.labels
    train_idx = list(range(len(all_triples) // 2))
    num_iterations = math.ceil(dataset.num_triples_with_anomalies / params.batch_size)
    total_num_anomalies = dataset.num_anomalies

    logging.basicConfig(level=logging.INFO)
    file_handler = logging.FileHandler(
        os.path.join(params.log_folder,
                     model_name + "_" + data_name + "_" + str(ratio) + "_log.txt"))
    logger = logging.getLogger()
    logger.addHandler(file_handler)
    logging.info('There are %d Triples with %d anomalies in the graph.'
                 % (len(dataset.labels), total_num_anomalies))

    params.total_ent = dataset.num_entity
    params.total_rel = dataset.num_relation

    model_saved_path = model_name + "_" + data_name + "_" + str(ratio) + ".ckpt"
    model_saved_path = os.path.join(params.out_folder, model_saved_path)
    # model.load_state_dict(torch.load(model_saved_path))

    # Model: BiLSTM with attention (same encoder as OurModel).
    model = BiLSTM_Attention(params.input_size_lstm, params.hidden_size_lstm,
                             params.num_layers_lstm, params.dropout,
                             params.alpha).to(params.device)
    criterion = nn.MarginRankingLoss(params.gama)
    optimizer = torch.optim.Adam(model.parameters(), lr=params.learning_rate)

    for k in range(kkkk):
        for it in range(num_iterations):
            batch_h, batch_r, batch_t, batch_size = get_pair_batch_train_common(
                dataset, it, train_idx, params.batch_size, params.num_neighbor)
            batch_h = torch.LongTensor(batch_h).to(params.device)
            batch_t = torch.LongTensor(batch_t).to(params.device)
            batch_r = torch.LongTensor(batch_r).to(params.device)

            out, out_att = model(batch_h, batch_r, batch_t)
            out = out.reshape(batch_size, -1, 2 * 3 * params.BiLSTM_hidden_size)
            out_att = out_att.reshape(batch_size, -1, 2 * 3 * params.BiLSTM_hidden_size)

            pos_h = out[:, 0, :]
            pos_z0 = out_att[:, 0, :]
            neg_h = out[:, 1, :]
            neg_z0 = out_att[:, 1, :]

            # Loss: here the lam1 term compares the raw output with its
            # attention view (lam1 is 0 above, so only the translational
            # ||h + r - t|| term is active).
            pos_loss = params.lam1 * torch.norm(pos_h - pos_z0, p=2, dim=1) + \
                params.lam2 * torch.norm(
                    pos_h[:, 0:2 * params.BiLSTM_hidden_size]
                    + pos_h[:, 2 * params.BiLSTM_hidden_size:2 * 2 * params.BiLSTM_hidden_size]
                    - pos_h[:, 2 * 2 * params.BiLSTM_hidden_size:2 * 3 * params.BiLSTM_hidden_size],
                    p=2, dim=1)
            neg_loss = params.lam1 * torch.norm(neg_h - neg_z0, p=2, dim=1) + \
                params.lam2 * torch.norm(
                    neg_h[:, 0:2 * params.BiLSTM_hidden_size]
                    + neg_h[:, 2 * params.BiLSTM_hidden_size:2 * 2 * params.BiLSTM_hidden_size]
                    - neg_h[:, 2 * 2 * params.BiLSTM_hidden_size:2 * 3 * params.BiLSTM_hidden_size],
                    p=2, dim=1)

            y = -torch.ones(batch_size).to(params.device)
            loss = criterion(pos_loss, neg_loss, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            pos_loss_value = torch.sum(pos_loss) / (batch_size * 2.0)
            neg_loss_value = torch.sum(neg_loss) / (batch_size * 2.0)
            logging.info('There are %d Triples in this batch.' % batch_size)
            logging.info('Epoch: %d-%d, pos_loss: %f, neg_loss: %f, Loss: %f'
                         % (k, it + 1, pos_loss_value.item(),
                            neg_loss_value.item(), loss.item()))

    torch.save(model.state_dict(), model_saved_path)

    # Evaluation: reload the trained weights and score every triple.
    model1 = BiLSTM_Attention(params.input_size_lstm, params.hidden_size_lstm,
                              params.num_layers_lstm, params.dropout,
                              params.alpha).to(params.device)
    model1.load_state_dict(torch.load(model_saved_path))
    model1.eval()
    with torch.no_grad():
        all_loss = []
        all_label = []
        start_id = 0
        for i in range(num_iterations):
            batch_h, batch_r, batch_t, labels, start_id, batch_size = get_pair_batch_test(
                dataset, params.batch_size, params.num_neighbor, start_id)
            batch_h = torch.LongTensor(batch_h).to(params.device)
            batch_t = torch.LongTensor(batch_t).to(params.device)
            batch_r = torch.LongTensor(batch_r).to(params.device)
            labels = labels.to(params.device)

            out, out_att = model1(batch_h, batch_r, batch_t)
            out_att_view0 = out_att

            loss = params.lam1 * torch.norm(out_att_view0 - out, p=2, dim=1) + \
                params.lam2 * torch.norm(
                    out[:, 0:2 * params.BiLSTM_hidden_size]
                    + out[:, 2 * params.BiLSTM_hidden_size:2 * 2 * params.BiLSTM_hidden_size]
                    - out[:, 2 * 2 * params.BiLSTM_hidden_size:2 * 3 * params.BiLSTM_hidden_size],
                    p=2, dim=1)

            all_loss += loss
            all_label += labels
            logging.info('[Train] Evaluation on %d batch of Original graph' % i)

        total_num = len(all_label)
        AUC11 = roc_auc_score(toarray(all_label), toarray_float(all_loss))
        logging.info('[Train] AUC of %d triples: %f' % (total_num, AUC11))

        ratios = [0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
                  0.1, 0.11, 0.12, 0.13, 0.14, 0.15]
        for i in range(len(ratios)):
            num_k = int(ratios[i] * dataset.num_original_triples)
            top_loss, top_indices = torch.topk(toarray_float(all_loss), num_k,
                                               largest=True, sorted=True)
            top_labels = toarray(
                [all_label[top_indices[iii]] for iii in range(len(top_indices))])
            top_sum = top_labels.sum()
            recall = top_sum * 1.0 / total_num_anomalies
            precision = top_sum * 1.0 / num_k
            logging.info('[Train][%s][%s] Precision %f -- %f : %f'
                         % (data_name, model_name, ratio, ratios[i], precision))
            logging.info('[Train][%s][%s] Recall %f -- %f : %f'
                         % (data_name, model_name, ratio, ratios[i], recall))
            logging.info('[Train][%s][%s] anomalies in total: %d -- discovered: %d -- K : %d'
                         % (data_name, model_name, total_num_anomalies, top_sum, num_k))
            if top_sum.item() < num_k and top_sum.item() != 0:
                AUC_K = roc_auc_score(toarray(top_labels), toarray_float(top_loss))
                logging.info('[Train][%s][%s] AUC %f -- %f : %f'
                             % (data_name, model_name, ratio, ratios[i], AUC_K))

        # Cumulative precision/recall over the top-K ranking, K up to twice
        # the number of injected anomalies.
        max_top_k = total_num_anomalies * 2
        top_loss, top_indices = torch.topk(toarray_float(all_loss), max_top_k,
                                           largest=True, sorted=True)
        top_labels = toarray(
            [all_label[top_indices[iii]] for iii in range(len(top_indices))])

        anomaly_discovered = []
        for i in range(max_top_k):
            if i == 0:
                anomaly_discovered.append(top_labels[i])
            else:
                anomaly_discovered.append(anomaly_discovered[i - 1] + top_labels[i])
        results_interval_10 = np.array(
            [anomaly_discovered[i * 10] for i in range(max_top_k // 10)])
        logging.info('[Train] final results: %s' % str(results_interval_10))

        top_k = np.arange(1, max_top_k + 1)
        assert len(top_k) == len(anomaly_discovered), 'The size of result list is wrong'
        precision_k = np.array(anomaly_discovered) / top_k
        recall_k = np.array(anomaly_discovered) * 1.0 / total_num_anomalies
        precision_interval_10 = [precision_k[i * 10] for i in range(max_top_k // 10)]
        logging.info('[Train] final Precision: %s' % str(precision_interval_10))
        recall_interval_10 = [recall_k[i * 10] for i in range(max_top_k // 10)]
        logging.info('[Train] final Recall: %s' % str(recall_interval_10))
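# --- Hedged usage sketch -----------------------------------------------------
# A plausible entry point (assumed; not part of the original file). Argument
# values are illustrative only: 5% injected anomalies, one training pass.
if __name__ == "__main__":
    OurModel(ratio=0.05, kkkk=1, neighbors=10)                   # FB15K run
    Triplenet(ratio=0.05, kkkk=1, data=params.data_dir_DBPEDIA)  # DBpedia run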