def predict(embedding_path="data/acceptor_hs3d/IE.{}"):
    arch = 'bert'
    true_embeddings = embedding(None, embedding_path.format(1), arch)
    # truncate the negative class so both classes have the same size
    false_embeddings = embedding(None, embedding_path.format(0), arch)[:len(true_embeddings)]
    # print(true_embeddings)   # debug
    # print(false_embeddings)  # debug
    # do a train/test split per class
    train_1, test_1 = train_test_split(true_embeddings)
    train_0, test_0 = train_test_split(false_embeddings)
    print("# of train_0: {}".format(len(train_0)))
    print("# of train_1: {}".format(len(train_1)))
    print("# of test_0: {}".format(len(test_0)))
    print("# of test_1: {}".format(len(test_1)))
    # clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-5, verbose=1)
    clf = SVC(kernel='linear', gamma='scale', verbose=True)
    train_x = np.concatenate([train_0, train_1], axis=0)
    test_x = np.concatenate([test_0, test_1], axis=0)
    train_y = np.array([0] * len(train_0) + [1] * len(train_1))
    test_y = np.array([0] * len(test_0) + [1] * len(test_1))
    clf.fit(train_x, train_y)
    preds = clf.predict(test_x)
    print(np.sum(preds))  # number of positive predictions
    # rates on the positive class: the true-positive rate and its complement,
    # the false-negative rate (the original mislabeled the latter 'FP')
    true_p = np.mean(preds[test_y == 1])
    false_n = np.mean(1 - preds[test_y == 1])
    print('ACC: {:.4f} TP: {:.4f} FN: {:.4f}'.format(
        np.mean(preds == test_y), true_p, false_n))
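# Note: embedding() is the shared helper nearly every snippet here relies on,
# but its definition is not included. A minimal sketch of the contract the
# call sites imply -- compute-or-load with an on-disk .npy cache keyed by
# architecture (the exact signature and the encode_with() helper are
# assumptions, not the original implementation):
import os
import numpy as np

def embedding(sents, path, arch, cached=True, key=None):
    cache_file = path + '.' + arch + '.npy'
    if cached and os.path.exists(cache_file):
        # cached load; this is why some call sites pass sents=None
        return np.load(cache_file)
    embs = encode_with(arch, sents)  # hypothetical model-specific encoder
    np.save(cache_file, embs)        # persist for later cached loads
    return embs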
def construct_datasets(arch='bert'):
    embedding_path = "data/acceptor_hs3d/IE.{}"
    true_akpt, false_akpt = prepare_raw_datasets()
    # embedding() presumably writes the computed embeddings under
    # embedding_path as a side effect, so nothing needs to be returned here
    true_embeddings = embedding(true_akpt, embedding_path.format(1), arch, False)
    false_embeddings = embedding(false_akpt, embedding_path.format(0), arch, False)
    return
def main():
    embedding(list(open(DS_PATH.format('train'))),
              "/DATACENTER/data/pxd/bert_privacy/data/medical.train.x", ARCH)
    embedding(list(open(DS_PATH.format('test'))),
              "/DATACENTER/data/pxd/bert_privacy/data/medical.test.x", ARCH)
    train_loader = get_dataloader(EMB_PATH, "train", ARCH, BATCH_SIZE)
    test_loader = get_dataloader(EMB_PATH, "test", ARCH, BATCH_SIZE)
    # define the model and the learning procedure; a linear classifier should suffice
    linear_classifier = MODEL_MAP[MODEL]()
    if USE_CUDA:
        linear_classifier = linear_classifier.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(linear_classifier.parameters(), lr=0.001, momentum=0.9)
    initial_acc = evaluate(test_loader, linear_classifier)
    print("Initial Accuracy: {:.3f}%".format(initial_acc))
    for epoch in tqdm(range(EPOCH_NUM)):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            if USE_CUDA:
                inputs, labels = inputs.cuda(), labels.cuda()
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = linear_classifier(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics every PRINT_FREQ mini-batches
            running_loss += loss.item()
            if i % PRINT_FREQ == (PRINT_FREQ - 1):
                print('[%d, %5d] loss: %.5f' %
                      (epoch + 1, i + 1, running_loss / PRINT_FREQ))
                running_loss = 0.0
        # evaluate and checkpoint after every epoch
        acc = evaluate(test_loader, linear_classifier)
        torch.save(linear_classifier.state_dict(),
                   MODEL_SAVE_PATH.format(ARCH, MODEL))
        print("After Epoch {}, {} Test Accuracy {:.3f}%".format(epoch + 1, ARCH, acc))
    print('Finished Training. Saving Model...')
    torch.save(linear_classifier.state_dict(),
               MODEL_SAVE_PATH.format(ARCH, MODEL))
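# The evaluate() helper called above is not shown in this snippet. A minimal
# sketch of what it presumably does, assuming the classifier returns per-class
# logits and the loader yields (inputs, labels) batches:
def evaluate(test_loader, model):
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            if USE_CUDA:
                inputs, labels = inputs.cuda(), labels.cuda()
            preds = model(inputs).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    # returned as a percentage, matching the "{:.3f}%" format strings above
    return 100.0 * correct / total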
def evaluate(clf, key, use_dp=False, dp_func=None, is_balanced=IS_BALANCED, verbose=VERBOSE):
    # load the target set
    f = open(TARGET_PATH, 'r')
    target_f = [x[:-1] for x in f if x[:-1] != '']
    f.close()
    target_embs = embedding(target_f, TARGET_EMB_PATH, ARCH)
    if use_dp:
        # optionally perturb the embeddings with the supplied DP mechanism
        target_embs = dp_func(target_embs)
    if is_balanced:
        target_f, target_embs = balance(key, target_f, target_embs)
    # results[pred][truth] accumulates a normalized confusion matrix
    results = np.zeros((2, 2))
    count = 0
    for i, sent in enumerate(list(target_f)):
        pred_ = clf.predict([target_embs[i]])[0]
        truth_ = int(key in sent)
        results[pred_][truth_] += 1
        count += 1
    results /= (count * 1.0)
    acc = results[0][0] + results[1][1]
    print("Target Domain Inference {} Acc: {:.3f}".format(key, acc))
    return acc
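# balance() is referenced throughout but never defined in these snippets. A
# plausible sketch, assuming it downsamples so that sentences containing the
# keyword and sentences without it are equally represented:
def balance(key, sents, embs):
    pos = [i for i, s in enumerate(sents) if key in s]
    neg = [i for i, s in enumerate(sents) if key not in s]
    n = min(len(pos), len(neg))
    idx = pos[:n] + neg[:n]
    return [sents[i] for i in idx], np.array([embs[i] for i in idx])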
def visualize(key):
    X, Y = [], []
    num = 0
    print("extract embedding info\n")
    for i in [0, 1]:
        f = open(PATH.format(key, i), 'r')
        sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, EMB_PATH.format(key, i), ARCH)
        X.append(embs)
        # num keeps only the size of the last class, which splits the scatter
        # plot correctly only when both classes have the same size
        num = embs.shape[0]
        Y.extend([i] * embs.shape[0])
        f.close()
    # reformat the data
    X = np.concatenate(X, axis=0)
    print(X.shape)
    Y = np.array(Y)
    # pca = PCA(n_components=3)  # unused; MDS is applied instead
    mds = MDS(n_components=3)
    X = mds.fit_transform(X)
    # plot the two classes in the 3-D MDS space
    print(X.shape)
    fig = plt.figure()
    ax = Axes3D(fig)
    ax.scatter(X[:num, 0], X[:num, 1], X[:num, 2], c='b')
    ax.scatter(X[num:, 0], X[num:, 1], X[num:, 2], c='g')
    plt.savefig('visual/{}.{}.mds3.png'.format(key, ARCH))
    return
def train_atk_classifier(key, size=110, verbose=VERBOSE):
    X_train, Y_train = [], []
    # build the training set: one file per label, embedded and subsampled to `size`
    for i in [0, 1]:
        f = open(DS_PATH.format(key, i) + '.txt', 'r')
        sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, DS_EMB_PATH.format(key, i), ARCH, key=key)
        embs = embs[np.random.choice(len(embs), size, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
        f.close()
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)
    # define the classifier
    if NONLINEAR:
        clf = NonLinearClassifier(EMB_DIM_TABLE[ARCH], HIDDEN_DIM)
        clf.to(torch.device('cpu'))
    else:
        clf = SVC(kernel=SVM_KERNEL, gamma='scale', verbose=False)
        # clf = LinearClassifier(EMB_DIM_TABLE[ARCH], HIDDEN_DIM, CLS_NUM)
    clf.fit(X_train, Y_train)
    Source_Acc = 0
    if verbose:
        print("TRAIN INFERENCE MODEL FROM EXTERNAL (Wiki) SOURCES (# = {})".format(len(X_train)))
        correct = np.sum(clf.predict(X_train) == Y_train)
        Source_Acc = correct / len(Y_train)
        print("Source Domain (Wiki) infers #{}# Acc.: {:.4f}".format(key, Source_Acc))
    return clf, Source_Acc
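# A hypothetical end-to-end invocation, assuming a keyword such as 'cancer'
# and the evaluate(clf, key) defined earlier:
# clf, source_acc = train_atk_classifier('cancer', size=110)
# target_acc = evaluate(clf, 'cancer')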
def get_batch(target=0, batch_size=10):
    batch = [gen(target) for i in range(batch_size)]
    z = embedding([x for x, y in batch], "tmp", ARCH, cached=False)
    # y = [int(y) for x, y in batch]
    z = torch.FloatTensor(z)
    # y = torch.LongTensor(y)
    return z, torch.LongTensor([text2seq(x) for x, y in batch])
def main(args):
    (train_dataset, train_dataloader, validate_dataset, validate_dataloader,
     test_dataset, test_dataloader) = baseline.load_datasets_and_dataloaders()
    embedding_matrix = util.embedding(train_dataset.text_vocab,
                                      util.config["glove_file_path"])
    # freeze the embedding layer only when pretrained GloVe vectors are used
    use_freeze = util.config["glove_file_path"] is not None
    embedding = torch.nn.Embedding.from_pretrained(embedding_matrix,
                                                   padding_idx=0,
                                                   freeze=use_freeze)
    # set up the net
    input_width = 300
    width = 150
    output_width = 1
    num_layers = 2
    model = RecurrentModel('LSTM', input_width, width, output_width, num_layers)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(args.epochs):
        print(f'----------------------------\nEpoch: {epoch}')
        train(model, train_dataloader, optimizer, criterion, embedding, args.clip)
        evaluate(model, validate_dataloader, criterion, embedding)
    # the original dropped the embedding argument here; it is restored to
    # match the other evaluate() calls
    evaluate(model, test_dataloader, criterion, embedding)
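# train() is defined elsewhere (baseline/RNN modules). A minimal sketch of the
# loop these arguments imply, assuming batches of (token_ids, labels, lengths)
# and a model that takes (vectors, lengths) -- both signatures are assumptions:
def train(model, dataloader, optimizer, criterion, embedding, clip):
    model.train()
    for token_ids, labels, lengths in dataloader:
        optimizer.zero_grad()
        vectors = embedding(token_ids)             # look up (possibly frozen) GloVe vectors
        logits = model(vectors, lengths).squeeze(-1)
        loss = criterion(logits, labels.float())   # BCEWithLogitsLoss expects float targets
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()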
def get_batch_ground_truth(target=0, batch_size=10):
    embedding_path = "data/acceptor_hs3d/IE.{}"
    TRUE_PATH = "data/acceptor_hs3d/IE_true.seq"
    z = embedding(None, embedding_path.format(1), ARCH)[:batch_size, :]
    y = _extract_genomes(TRUE_PATH)[:batch_size]
    # encode the length-INTERVAL_LEN window starting at `target` as a class id
    y = [seq2id(x[target:target + INTERVAL_LEN]) for x in y]
    z = torch.FloatTensor(z)
    y = torch.LongTensor(y)
    return z, y, None
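# seq2id() and _extract_genomes() are not defined in these snippets. Plausible
# sketches, assuming _extract_genomes reads one ACGT sequence per line and
# seq2id encodes a window of length INTERVAL_LEN as a single base-4 integer:
def _extract_genomes(path):
    return [line.strip() for line in open(path) if line.strip()]

def seq2id(window):
    idx = 0
    for ch in window:
        idx = idx * 4 + 'ACGT'.index(ch)
    return idx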
def get_batch(target=0, batch_size=10):
    batch = [gen(target) for i in range(batch_size)]
    z = embedding([x for x, y in batch], "tmp", ARCH, cached=False)
    # reshape the 1024-dim embeddings into 1x32x32 image-like tensors
    z = np.expand_dims(np.expand_dims(z, axis=2).reshape((batch_size, 32, 32)), axis=1)
    y = [int(y) for x, y in batch]
    z = torch.FloatTensor(z)
    # print(z.size())  # debug
    y = torch.LongTensor(y)
    return z, y, [x for x, y in batch]
def get_batch_ground_truth(target=0, batch_size=10):
    embedding_path = "data/acceptor_hs3d/IE.{}"
    TRUE_PATH = "data/acceptor_hs3d/IE_true.seq"
    z = embedding(None, embedding_path.format(1), ARCH)[:batch_size, :]
    # reshape the 1024-dim embeddings into 1x32x32 image-like tensors
    z = np.expand_dims(np.expand_dims(z, axis=2).reshape((batch_size, 32, 32)), axis=1)
    y = _extract_genomes(TRUE_PATH)[:batch_size]
    y = [seq2id(x[target:target + INTERVAL_LEN]) for x in y]
    z = torch.FloatTensor(z)
    # print(z.size())  # debug
    y = torch.LongTensor(y)
    return z, y, None
def get_batch(target=0, batch_size=10):
    global CENTERS
    pca = PCA(n_components=2)  # unused; left over from the centering experiment
    batch = []
    for i in range(batch_size):
        batch.extend(gen(target))
    z = embedding([x for x, y in batch], "tmp", ARCH, cached=False)
    # leftovers from an experiment to centralize the embeddings (see the
    # commented-out version of this function below); both remain unused here
    centers = []
    inner_cluster_dist = []
    y = [int(y) for x, y in batch]
    z = torch.FloatTensor(z)
    y = torch.LongTensor(y)
    return z, y, [x for x, y in batch]
def get_batch(target=0, batch_size=10):
    global PLOTTED
    pca = PCA(n_components=2)
    batch = []
    for i in range(batch_size):
        batch.extend(gen(target))
    z = embedding([x for x, y in batch], "tmp", ARCH, cached=False)
    # to centralize the embeddings
    centers = []
    inner_cluster_dist = []
    # for i in range(z.shape[0] // 4):
    #     c = np.mean(z[i*4:(i+1)*4, :], axis=0)
    #     z[i*4:(i+1)*4] = z[i*4:(i+1)*4] - c
    #     A_vecs = []
    #     for k in range(4):
    #         A_vecs.append(np.array([z[i, :] for i in range(z.shape[0]) if i % 4 == k]))
    #     total = np.concatenate(A_vecs, axis=0)
    #     pca = MDS(n_components=2)
    #     total = pca.fit_transform(total)
    #     colors = sns.color_palette("hls", 4)
    #     interval = len(total) // 4
    #     if not PLOTTED:
    #         for k in range(4):
    #             plt.scatter(total[k*interval:(k+1)*interval, 0],
    #                         total[k*interval:(k+1)*interval, 1],
    #                         c=[colors[k] for i in range(interval)])
    #         plt.savefig('delta_mds_center.png')
    #         PLOTTED = True
    #     centers.append(c)
    #     if np.random.rand() < 0.1:
    #         CENTERS.append(c)  # collect the centers
    # inner_cluster_dist = np.linalg.norm(z, axis=0)
    # centers = np.array(centers)
    # dist = pdist(centers, 'euclidean')
    # print("OUTER: {}".format(describe(dist)))
    # print("INNER: {}".format(describe(inner_cluster_dist)))
    # for i in range(z.shape[0]):
    #     z[i, :] = z[i, :] - np.mean(z[i, :])  # what about the average
    y = [int(y) for x, y in batch]
    z = torch.FloatTensor(z)
    y = torch.LongTensor(y)
    return z, y, [x for x, y in batch]
def use_DANN(key):
    # build X_train, Y_train from the two labeled source files
    X_train, Y_train = [], []
    for i in [0, 1]:
        f = open(DS_PATH.format(key, i) + '.txt', 'r')
        sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, DS_EMB_PATH.format(key, i), ARCH, key)
        embs = embs[np.random.choice(len(embs), 110, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
        f.close()
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)
    # build X_valid, Y_valid from the target domain
    raw_valid, X_valid = list(open(TARGET_PATH, 'r')), np.load(TARGET_EMB_PATH + '.' + ARCH + '.npy')
    X_valid_b = X_valid
    if IS_BALANCED:
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    Y_valid = np.array([(key in x) for x in raw_valid])
    clf = DANN(input_size=EMB_DIM_TABLE[ARCH],
               maxiter=DANN_MAXITER,
               verbose=False,
               name=key,
               batch_size=DANN_BATCH_SIZE,
               lambda_adapt=DANN_LAMBDA,
               hidden_layer_size=DANN_HIDDEN)
    # open question: how to choose X_adapt -- X_valid before or after balancing?
    acc = clf.fit(X_train, Y_train, X_adapt=X_valid, X_valid=X_valid, Y_valid=Y_valid)
    return acc
def train_atk_classifier(key, size=1900):
    pca = None
    X_train, Y_train = [], []
    for i in [0, 1]:
        f = open(PATH.format(key, i), 'r')
        sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, EMB_PATH.format(key, i), ARCH)
        if args.prefix != 'part':
            embs = embs[np.random.choice(len(embs), size, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
        f.close()
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)
    train_embs = np.load(TRAIN_EMB_PATH)  # bottleneck
    # X_train = np.load(TRAIN_EMB_PATH)
    # raw_train = list(open(TRAIN_PATH, 'r'))
    # if IS_BALANCED:
    #     raw_train, X_train = balance(key, raw_train, X_train)
    # Y_train = np.array([(key in x) for x in raw_train])
    # load the validation set
    raw_valid, X_valid = list(open(TARGET_PATH, 'r')), np.load(TARGET_EMB_PATH)
    if key != 'potato' and IS_BALANCED:
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    print(len(raw_valid))
    Y_valid = np.array([(key in x) for x in raw_valid])
    acc = -1
    # learn a transfer; note that clf is only defined when NONLINEAR is set
    # clf = linear_model.SGDClassifier(max_iter=1000, verbose=0)
    # clf = SVC(kernel='rbf', gamma='scale', verbose=False)
    # clf = KNeighborsClassifier(n_neighbors=1, p=1)
    if NONLINEAR:
        # clf = DANN(input_size=EMB_DIM, maxiter=2000, verbose=False, name=key, batch_size=128)
        clf = DANN(input_size=EMB_DIM,
                   maxiter=4000,
                   verbose=True,
                   name=key,
                   batch_size=64,
                   lambda_adapt=1.0,
                   hidden_layer_size=25)
        acc = clf.fit(X_train, Y_train, X_adapt=train_embs, X_valid=X_valid, Y_valid=Y_valid)
        print("DANN Acc.: {:.4f}".format(acc))
    # train_embs = train_embs[np.random.choice(len(train_embs), 2000), :]
    # apply PCA first
    # if DO_PCA:
    #     train_embs = train_embs[np.random.choice(len(train_embs), size=6 * int(len(X_train)), replace=False)]
    #     package = np.concatenate([X_train, train_embs], axis=0)
    #     pca = PCA(n_components=INPUT_DIM)
    #     pca.fit(package)
    #     X_train, train_embs = pca.transform(X_train), pca.transform(train_embs)
    # if NONLINEAR:
    #     clf = NonLinearClassifier(key, ARCH, cls_num=2, pca=pca, use_pca=DO_PCA)
    #     clf.fit(X_train, Y_train)
    if NONLINEAR:
        clf.to(torch.device('cpu'))
    # report accuracy on the source set
    if VERBOSE:
        print("TRAIN INFERENCE MODEL FROM EXTERNAL SOURCES (# = {})".format(len(X_train)))
        correct = np.sum(clf.predict(X_train) == Y_train)
        print("Source Domain Acc.: {:.4f}".format(correct / len(Y_train)))
    return clf, pca, acc
# (fragment: `tag` is presumably initialized above from the male training tags)
tag |= set(data_set.train_fmale.tag.tag)
tag = list(tag)
dct = {tg: idx for idx, tg in enumerate(tag)}
train = []
label = []
ev_mat = getEmbeddingMat(50, len(tag))
# encode each male user's tag set as a multi-hot vector, project it through
# the embedding matrix, and record label '1'
ids = set(data_set.train_male.tag.id)
ref = data_set.train_male.tag
for i in ids:
    tg = ref[ref.id == i].tag
    seq = [0 for _ in range(len(tag))]
    for t in tg:
        seq[dct[t]] = 1
    seq = embedding(numpy.array(seq), ev_mat)
    label.append('1')
    train.append(seq)
with open('save/train_male.txt', 'w') as f:
    for seq in train:
        s = ''
        for i in seq:
            s += str(i) + ' '
        s += '\n'
        f.write(s)
# the same encoding for female users; the fragment ends here, and the loop
# body presumably mirrors the male one with label '0'
ids = set(data_set.train_fmale.tag.id)
ref = data_set.train_fmale.tag
for i in ids:
    tg = ref[ref.id == i].tag
    seq = [0 for _ in range(len(tag))]
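# getEmbeddingMat() and this embedding() variant are not shown. A plausible
# sketch, assuming a random (num_tags x dim) matrix and a multi-hot projection
# into the dense tag-embedding space:
import numpy

def getEmbeddingMat(dim, num_tags):
    # one dim-dimensional embedding vector per tag
    return numpy.random.randn(num_tags, dim)

def embedding(multi_hot, ev_mat):
    # sum of the embedding vectors of the active tags
    return numpy.dot(multi_hot, ev_mat)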
if __name__ == '__main__':
    # clean_raw_data(PATH, DUMP_PATH)
    # class_stat(DUMP_PATH)
    # prepare_dataset(DUMP_PATH)
    # for split in ["train", "test"]:
    #     obtain_bert_embeddings(DS_PATH, EMB_PATH, split)
    # main()
    # pre-compute embeddings for each split under every architecture
    PATH = "data/medical.{}.txt"
    name = "data/medical.{}.x"
    for split in ["train", "test"]:
        sents = list(open(PATH.format(split), 'r'))
        types = ["bert", "gpt", "gpt2", "xl"]
        for t in types:
            embedding(sents, name.format(split), t, False)
def best_cell_and_baseline_with_and_without_pretrained(train_dataset, train_dataloader,
                                                       validate_dataloader, test_dataloader,
                                                       clip):
    configs = []
    RNN_config = {}
    RNN_config["model"] = "LSTM"
    RNN_config["hidden_size"] = 30
    RNN_config["num_layers"] = 3
    RNN_config["dropout"] = 0.9
    RNN_config["bidirectional"] = True
    RNN_config["fc1_width"] = "//"
    RNN_config["fc2_width"] = "//"
    baseline_config = {}
    baseline_config["model"] = "Baseline"
    baseline_config["hidden_size"] = "//"
    baseline_config["num_layers"] = "//"
    baseline_config["dropout"] = "//"
    baseline_config["bidirectional"] = "//"
    baseline_config["fc1_width"] = 150
    baseline_config["fc2_width"] = 150
    initial_config = {}
    initial_config["clip"] = args.clip      # note: `args` comes from the enclosing scope
    initial_config["epochs"] = args.epochs
    initial_config["input_width"] = 300
    initial_config["output_width"] = 1
    lstm = RNN.RecurrentModel(RNN_config["model"],
                              initial_config["input_width"],
                              RNN_config["hidden_size"],
                              initial_config["output_width"],
                              RNN_config["num_layers"],
                              RNN_config["bidirectional"],
                              RNN_config["dropout"])
    base = BaselineModel(initial_config["input_width"],
                         baseline_config["fc1_width"],
                         baseline_config["fc2_width"],
                         initial_config["output_width"])
    # caveat: the two models are shared across both embedding settings below,
    # so the second run continues from the weights learned in the first
    models = [base, lstm]
    criterion = nn.BCEWithLogitsLoss()
    use_embeddings = [False, True]
    for use_embedding in use_embeddings:
        if use_embedding:
            file_path = util.config["glove_file_path"]
        else:
            file_path = None
        embedding_matrix = util.embedding(train_dataset.text_vocab, file_path)
        use_freeze = util.config["glove_file_path"] is not None
        embedding = torch.nn.Embedding.from_pretrained(embedding_matrix,
                                                       padding_idx=0,
                                                       freeze=use_freeze)
        for model in models:
            start = time.time()
            config = {}
            if type(model) == RNN.RecurrentModel:
                config.update(RNN_config)
                train = RNN.train
                evaluate = RNN.evaluate
            else:
                config.update(baseline_config)
                train = baseline.train
                evaluate = baseline.evaluate
            config.update(initial_config)
            config["pretrained"] = use_embedding
            print(config)
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
            for epoch in range(args.epochs):
                print(f'----------------------------\nEpoch: {epoch}')
                # the original passed `clip.clip` here; the `clip` parameter
                # of this function is used instead
                train(model, train_dataloader, optimizer, criterion, embedding, clip)
                evaluate(model, validate_dataloader, criterion, embedding)
            accuracy, f1, confusion_matrix = evaluate(model, test_dataloader, criterion, embedding)
            config["accuracy"] = accuracy.item()
            config["f1"] = f1.item()
            config["TP"] = confusion_matrix[0, 0].item()
            config["FP"] = confusion_matrix[0, 1].item()
            config["FN"] = confusion_matrix[1, 0].item()
            config["TN"] = confusion_matrix[1, 1].item()
            end = time.time()
            config["time"] = end - start
            configs.append(config)
    print_to_file("4a_pretrained.xls", "Best RNN and baseline", configs)