Example #1
def predict(embedding_path = "data/acceptor_hs3d/IE.{}"):
    arch = 'bert'
    true_embeddings = embedding(None, embedding_path.format(1), arch)
    false_embeddings = embedding(None, embedding_path.format(0), arch)[:len(true_embeddings)]
    print(true_embeddings)
    print(false_embeddings)
    # do a train test split
    train_1, test_1 = train_test_split(true_embeddings)
    train_0, test_0 = train_test_split(false_embeddings)
    print("# of train_0: {}".format(len(train_0)))
    print("# of train_1: {}".format(len(train_1)))
    print("# of test_0: {}".format(len(test_0)))
    print("# of test_1: {}".format(len(test_1)))
    # clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-5, verbose = 1)
    clf = SVC(kernel = 'linear', gamma = 'scale', verbose = True)
    train_x = np.concatenate([train_0, train_1], axis = 0)
    test_x = np.concatenate([test_0, test_1], axis = 0)
    train_y = np.array([0] * len(train_0) + [1] * len(train_1))
    test_y = np.array([0] * len(test_0) + [1] * len(test_1))
    clf.fit(train_x, train_y)
    preds = clf.predict(test_x)
    print(np.sum(preds))
    true_p = np.mean(preds[test_y == 1])   # true positive rate
    false_p = np.mean(preds[test_y == 0])  # false positive rate
    print('ACC: {:.4f} TP: {:.4f} FP: {:.4f}'.format(np.mean(preds == test_y), true_p, false_p))
def construct_datasets(arch='bert'):
    embedding_path = "data/acceptor_hs3d/IE.{}"
    true_akpt, false_akpt = prepare_raw_datasets()
    true_embeddings = embedding(true_akpt, embedding_path.format(1), arch,
                                False)
    false_embeddings = embedding(false_akpt, embedding_path.format(0), arch,
                                 False)
    # embedding() presumably caches the computed vectors at embedding_path,
    # so there is nothing to return here
    return
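Side note (not part of the original example): the accuracy / TP / FP numbers computed in predict() above can be cross-checked with scikit-learn's built-in metrics; a minimal sketch with toy arrays:

import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score

# toy predictions/labels standing in for preds and test_y above
test_y = np.array([0, 0, 1, 1, 1])
preds = np.array([0, 1, 1, 1, 0])

tn, fp, fn, tp = confusion_matrix(test_y, preds).ravel()
acc = accuracy_score(test_y, preds)
tpr = tp / (tp + fn)  # true positive rate (what true_p measures)
fpr = fp / (fp + tn)  # false positive rate (what false_p measures)
print('ACC: {:.4f} TP: {:.4f} FP: {:.4f}'.format(acc, tpr, fpr))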
Example #3
def main():
    embedding(list(open(DS_PATH.format('train'))),
              "/DATACENTER/data/pxd/bert_privacy/data/medical.train.x", ARCH)
    embedding(list(open(DS_PATH.format('test'))),
              "/DATACENTER/data/pxd/bert_privacy/data/medical.test.x", ARCH)
    train_loader = get_dataloader(EMB_PATH, "train", ARCH, BATCH_SIZE)
    test_loader = get_dataloader(EMB_PATH, "test", ARCH, BATCH_SIZE)

    # define the model and the learning procedure; a linear classifier is sufficient here
    linear_classifier = MODEL_MAP[MODEL]()
    if (USE_CUDA):
        linear_classifier = linear_classifier.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(linear_classifier.parameters(),
                          lr=0.001,
                          momentum=0.9)

    initial_acc = evaluate(test_loader, linear_classifier)
    print("Initial Accuracy:{:.3f}%".format(initial_acc))
    for epoch in tqdm(
            range(EPOCH_NUM)):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            if (USE_CUDA):
                inputs, labels = inputs.cuda(), labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = linear_classifier(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            if i % PRINT_FREQ == (PRINT_FREQ -
                                  1):  # print every PRINT_FREQ mini-batches
                print('[%d, %5d] loss: %.5f' %
                      (epoch + 1, i + 1, running_loss / PRINT_FREQ))
                running_loss = 0.0
        # do evaluation
        acc = evaluate(test_loader, linear_classifier)
        # save the model
        torch.save(linear_classifier.state_dict(),
                   MODEL_SAVE_PATH.format(ARCH, MODEL))
        print("After Epoch {}, {} Test Accuracy {:.3f}% ".format(
            epoch + 1, ARCH, acc))
    print('Finished Training. Saving Model...')
    torch.save(linear_classifier.state_dict(),
               MODEL_SAVE_PATH.format(ARCH, MODEL))
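The evaluate() helper called in main() is not shown in this snippet; the following is only a plausible sketch of it (a plain accuracy loop over a dataloader, using the same USE_CUDA convention), not the original implementation:

def evaluate(loader, model):
    # sketch only: classification accuracy (%) over a dataloader
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            if USE_CUDA:
                inputs, labels = inputs.cuda(), labels.cuda()
            outputs = model(inputs)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    model.train()
    return 100.0 * correct / total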
Example #4
def evaluate(clf,
             key,
             use_dp=False,
             dp_func=None,
             is_balanced=IS_BALANCED,
             verbose=VERBOSE):
    # load the target set
    f = open(TARGET_PATH, 'r')
    target_f = [x[:-1] for x in f if x[:-1] != '']
    f.close()
    # print("Waiting Embedding...")
    target_embs = embedding(target_f, TARGET_EMB_PATH, ARCH)
    # print("Embedding Finished.")

    if (use_dp):
        target_embs = dp_func(target_embs)
    if (is_balanced):
        target_f, target_embs = balance(key, target_f, target_embs)

    results = np.zeros((2, 2))
    count = 0

    for i, sent in enumerate(list(target_f)):
        pred_ = clf.predict([target_embs[i]])[0]
        truth_ = int(key in sent)
        results[pred_][truth_] += 1
        count += 1
    results /= (count * 1.0)
    acc = results[0][0] + results[1][1]
    print("Target Domain Inference {} Acc: {:.3f}".format(key, acc))
    return acc
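As a small follow-up (assuming the results[prediction][truth] layout used above), precision and recall can be read off the same normalized confusion matrix; this is a sketch, not part of the original code:

def precision_recall(results):
    # results[pred][truth], normalized so the four cells sum to 1
    tp = results[1][1]
    fp = results[1][0]
    fn = results[0][1]
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    return precision, recall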
Example #5
def visualize(key):
    X = []
    Y = []
    num = 0
    print("extract embedding inform\n")
    for i in [0, 1]:
        with open(PATH.format(key, i), 'r') as f:
            sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, EMB_PATH.format(key, i), ARCH)
        X.append(embs)
        num = embs.shape[0]  # only the last class's count is kept; the scatter split below assumes equal class counts
        Y.extend([i] * embs.shape[0])

    # reformat the data
    X = np.concatenate(X, axis=0)
    print(X.shape)
    Y = np.array(Y)
    # pca = PCA(n_components=3)  # unused; MDS is used for the projection below
    mds = MDS(n_components=3)
    X = mds.fit_transform(X)

    # plot
    print(X.shape)
    fig = plt.figure()
    ax = Axes3D(fig)
    ax.scatter(X[:num, 0], X[:num, 1], X[:num, 2], c='b')
    ax.scatter(X[num:, 0], X[num:, 1], X[num:, 2], c='g')
    plt.savefig('visual/{}.{}.mds3.png'.format(key, ARCH))
    return
Example #6
def train_atk_classifier(key, size=110, verbose=VERBOSE):
    X_train, Y_train = [], []

    # get training dataset
    for i in [0, 1]:
        f = open(DS_PATH.format(key, i) + '.txt', 'r')
        sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, DS_EMB_PATH.format(key, i), ARCH, key=key)
        embs = embs[np.random.choice(len(embs), size, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
        f.close()
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)

    # define clf
    if NONLINEAR:
        clf = NonLinearClassifier(EMB_DIM_TABLE[ARCH], HIDDEN_DIM)
        clf.to(torch.device('cpu'))
    else:
        clf = SVC(kernel=SVM_KERNEL, gamma='scale', verbose=False)
        # clf = LinearClassifier(EMB_DIM_TABLE[ARCH], HIDDEN_DIM, CLS_NUM)

    clf.fit(X_train, Y_train)

    Source_Acc = 0
    if (verbose):
        print("TRAIN INFERENCE MODEL FROM EXTERNAL(Wiki) SOURCES (# = {})".
              format(len(X_train)))
        correct = np.sum((clf.predict(X_train) == Y_train))
        Source_Acc = correct / len(Y_train)
        print("Source Domain(Wiki) infers #{}# Acc.: {:.4f}".format(
            key, Source_Acc))
    return clf, Source_Acc
Example #7
def get_batch(target=0, batch_size=10):
    batch = [gen(target) for i in range(batch_size)]
    z = embedding([x for x, y in batch], "tmp", ARCH, cached=False)
    # y = [int(y) for x, y in batch]
    z = torch.FloatTensor(z)
    # y = torch.LongTensor(y)
    return z, torch.LongTensor([text2seq(x) for x, y in batch])
Example #8
def main(args):
    train_dataset, train_dataloader, validate_dataset, validate_dataloader, test_dataset, test_dataloader = baseline.load_datasets_and_dataloaders()

    embedding_matrix = util.embedding(train_dataset.text_vocab,
                                      util.config["glove_file_path"])
    use_freeze = util.config["glove_file_path"] is not None
    embedding = torch.nn.Embedding.from_pretrained(embedding_matrix,
                                                   padding_idx=0,
                                                   freeze=use_freeze)

    # setup net
    input_width = 300
    width = 150
    output_width = 1
    num_layers = 2

    model = RecurrentModel('LSTM', input_width, width, output_width,
                           num_layers)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(args.epochs):
        print(f'----------------------------\nEpoch: {epoch}')
        train(model, train_dataloader, optimizer, criterion, embedding,
              args.clip)
        evaluate(model, validate_dataloader, criterion, embedding)
    evaluate(model, test_dataloader, criterion, embedding)
Example #9
def get_batch_ground_truth(target = 0, batch_size = 10):
    embedding_path = "data/acceptor_hs3d/IE.{}"
    TRUE_PATH = "data/acceptor_hs3d/IE_true.seq"
    z = embedding(None, embedding_path.format(1), ARCH)[:batch_size, :]
    y = _extract_genomes(TRUE_PATH)[:batch_size]
    y = [seq2id(x[target:target+INTERVAL_LEN]) for x in y]
    z = torch.FloatTensor(z)
    y = torch.LongTensor(y)
    return z, y, None
Example #10
def get_batch(target=0, batch_size=10):
    batch = [gen(target) for i in range(batch_size)]
    z = embedding([x for x, y in batch], "tmp", ARCH, cached=False)
    z = np.expand_dims(np.expand_dims(z, axis=2).reshape((batch_size, 32, 32)),
                       axis=1)
    y = [int(y) for x, y in batch]
    z = torch.FloatTensor(z)
    # print(z.size())
    y = torch.LongTensor(y)
    return z, y, [x for x, y in batch]
Example #11
def get_batch_ground_truth(target=0, batch_size=10):
    embedding_path = "data/acceptor_hs3d/IE.{}"
    TRUE_PATH = "data/acceptor_hs3d/IE_true.seq"
    z = embedding(None, embedding_path.format(1), ARCH)[:batch_size, :]
    z = np.expand_dims(np.expand_dims(z, axis=2).reshape((batch_size, 32, 32)),
                       axis=1)
    y = _extract_genomes(TRUE_PATH)[:batch_size]
    y = [seq2id(x[target:target + INTERVAL_LEN]) for x in y]
    z = torch.FloatTensor(z)
    # print(z.size())
    y = torch.LongTensor(y)
    return z, y, None
Example #12
def get_batch(target=0, batch_size=10):
    global CENTERS
    pca = PCA(n_components=2)
    batch = []
    for i in range(batch_size):
        batch.extend(gen(target))
    z = embedding([x for x, y in batch], "tmp", ARCH, cached=False)
    # to centralize the embeddings
    centers = []
    inner_cluster_dist = []
    y = [int(y) for x, y in batch]
    z = torch.FloatTensor(z)
    y = torch.LongTensor(y)
    return z, y, [x for x, y in batch]
Example #13
def get_batch(target = 0, batch_size = 10):
    global PLOTTED
    pca = PCA(n_components=2)
    batch = []
    for i in range(batch_size):
        batch.extend(gen(target))
    z = embedding([x for x, y in batch], "tmp", ARCH, cached = False)
    # to centralize the embeddings
    centers = []
    inner_cluster_dist = []
    # for i in range(z.shape[0]//4):
    #     c = np.mean(z[i*4:(i+1)*4, :], axis = 0)
    #     z[i*4:(i+1)*4] =  z[i*4:(i+1)*4] - c
    # A_vecs = []
    # for k in range(4):
    #     A_vecs.append(np.array([z[i, :] for i in range(z.shape[0]) if i % 4 == k]))
    # total = np.concatenate(A_vecs, axis = 0)
    # pca = MDS(n_components=2)
    # total = pca.fit_transform(total)
    # colors = sns.color_palette("hls", 4)
    # interval = len(total) // 4
    # if(not PLOTTED):
    #     for k in range(4):
    #         plt.scatter(total[k*interval:(k+1)*interval,0], total[k*interval:(k+1)*interval,1], c = [colors[k] for i in range(interval)])
    #     plt.savefig('delta_mds_center.png')
    #     PLOTTED = True
    
        # centers.append(c)
        # if(np.random.rand() < 0.1):
        #    CENTERS.append(c) # collect the centers
    # inner_cluster_dist = np.linalg.norm(z, axis = 0)
    # centers = np.array(centers)
    # dist = pdist(centers, 'euclidean')
    # print("OUTER: {}".format(describe(dist)))
    # print("INNER: {}".format(describe(inner_cluster_dist)))
    # for i in range(z.shape[0]):
    #     z[i, :] = z[i, :] - np.mean(z[i, :]) # what about the average
    y = [int(y) for x, y in batch]
    z = torch.FloatTensor(z)
    y = torch.LongTensor(y)
    return z, y, [x for x, y in batch]
Example #14
def use_DANN(key):
    # X_train, Y_train
    X_train, Y_train = [], []
    # get training dataset
    for i in [0, 1]:
        f = open(DS_PATH.format(key, i) + '.txt', 'r')
        sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, DS_EMB_PATH.format(key, i), ARCH, key)
        embs = embs[np.random.choice(len(embs), 110, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
        f.close()
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)

    # X_valid, Y_valid
    raw_valid, X_valid = list(open(
        TARGET_PATH, 'r')), np.load(TARGET_EMB_PATH + '.' + ARCH + '.npy')
    X_valid_b = X_valid
    if (IS_BALANCED):
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    Y_valid = np.array([(key in x) for x in raw_valid])

    clf = DANN(input_size=EMB_DIM_TABLE[ARCH],
               maxiter=DANN_MAXITER,
               verbose=False,
               name=key,
               batch_size=DANN_BATCH_SIZE,
               lambda_adapt=DANN_LAMBDA,
               hidden_layer_size=DANN_HIDDEN)

    # How to choose X_adapt: X_valid (before or after balancing)?
    acc = clf.fit(X_train,
                  Y_train,
                  X_adapt=X_valid,
                  X_valid=X_valid,
                  Y_valid=Y_valid)
    return acc
Example #15
def train_atk_classifier(key, size=1900):
    pca = None
    X_train, Y_train = [], []

    for i in [0, 1]:
        with open(PATH.format(key, i), 'r') as f:
            sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, EMB_PATH.format(key, i), ARCH)
        if args.prefix != 'part':
            embs = embs[np.random.choice(len(embs), size, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)
    train_embs = np.load(TRAIN_EMB_PATH)

    # BottleNeck
    # X_train = np.load(TRAIN_EMB_PATH)
    # raw_train = list(open(TRAIN_PATH, 'r'))
    # if IS_BALANCED:
    # raw_train, X_train = balance(key, raw_train, X_train)
    # Y_train = np.array([(key in x) for x in raw_train])

    # load validation set

    raw_valid, X_valid = list(open(TARGET_PATH, 'r')), np.load(TARGET_EMB_PATH)
    if (key != 'potato' and IS_BALANCED):
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    print(len(raw_valid))
    Y_valid = np.array([(key in x) for x in raw_valid])
    acc = -1
    # learn a transfer

    # clf = linear_model.SGDClassifier(max_iter = 1000,  verbose = 0)
    # clf = SVC(kernel = 'rbf', gamma = 'scale', verbose = False)
    # clf = KNeighborsClassifier(n_neighbors=1, p = 1)
    if (NONLINEAR):
        # clf = DANN(input_size = EMB_DIM, maxiter = 2000, verbose = False, name = key, batch_size = 128)
        clf = DANN(input_size=EMB_DIM,
                   maxiter=4000,
                   verbose=True,
                   name=key,
                   batch_size=64,
                   lambda_adapt=1.0,
                   hidden_layer_size=25)
        acc = clf.fit(X_train,
                      Y_train,
                      X_adapt=train_embs,
                      X_valid=X_valid,
                      Y_valid=Y_valid)
        print("DANN Acc.: {:.4f}".format(acc))
    # train_embs = train_embs[np.random.choice(len(train_embs), 2000), :]

    # # apply pca first
    # if(DO_PCA):
    # train_embs = train_embs[np.random.choice(len(train_embs), size = 6 * int(len(X_train)), replace = False)]
    # package = np.concatenate([X_train, train_embs], axis = 0)
    # pca = PCA(n_components=INPUT_DIM)
    # pca.fit(package)
    # X_train, train_embs = pca.transform(X_train), pca.transform(train_embs)

    # if NONLINEAR:
    # clf = NonLinearClassifier(key, ARCH, cls_num = 2, pca = pca, use_pca = DO_PCA)

    # clf.fit(X_train, Y_train)

    if NONLINEAR:
        clf.to(torch.device('cpu'))
    # note: this snippet only defines clf on the NONLINEAR (DANN) path; the
    # linear alternatives above are all commented out
    # on current set
    # correct = 0
    if (VERBOSE):
        print("TRAIN INFERENCE MODEL FROM EXTERNAL SOURCES (# = {})".format(
            len(X_train)))
        correct = np.sum((clf.predict(X_train) == Y_train))
        print("Source Domain Acc.: {:.4f}".format(correct / len(Y_train)))
    return clf, pca, acc
Example #16
tag |= set(data_set.train_fmale.tag.tag)

tag = list(tag)
dct = {tg: idx for idx, tg in enumerate(tag)}
train = []
label = []
ev_mat = getEmbeddingMat(50, len(tag))

ids = set(data_set.train_male.tag.id)
ref = data_set.train_male.tag
for i in ids:
    tg = ref[ref.id == i].tag
    seq = [0 for i in range(len(tag))]
    for t in tg:
        seq[dct[t]] = 1
    seq = embedding(numpy.array(seq), ev_mat)
    label.append('1')
    train.append(seq)
with open('save/train_male.txt', 'w') as f:
    for seq in train:
        s = ''
        for i in seq:
            s += str(i) + ' '
        s += '\n'
        f.write(s)

ids = set(data_set.train_fmale.tag.id)
ref = data_set.train_fmale.tag
for i in ids:
    tg = ref[ref.id == i].tag
    seq = [0 for i in range(len(tag))]
Example #17
                                  1):  # print every PRINT_FREQ mini-batches
                print('[%d, %5d] loss: %.5f' %
                      (epoch + 1, i + 1, running_loss / PRINT_FREQ))
                running_loss = 0.0
        # do evaluation
        acc = evaluate(test_loader, linear_classifier)
        # save the model
        torch.save(linear_classifier.state_dict(),
                   MODEL_SAVE_PATH.format(ARCH, MODEL))
        print("After Epoch {}, {} Test Accuracy {:.3f}% ".format(
            epoch + 1, ARCH, acc))
    print('Finished Training. Saving Model...')
    torch.save(linear_classifier.state_dict(),
               MODEL_SAVE_PATH.format(ARCH, MODEL))


if __name__ == '__main__':
    # clean_raw_data(PATH, DUMP_PATH)
    # class_stat(DUMP_PATH)
    # prepare_dataset(DUMP_PATH)
    # for split in ["train", "test"]:
    #     obtain_bert_embeddings(DS_PATH, EMB_PATH, split)
    # main()
    PATH = "data/medical.{}.txt"
    name = "data/medical.{}.x"
    for split in ["train", "test"]:
        sents = list(open(PATH.format(split), 'r'))
        types = ["bert", "gpt", "gpt2", "xl"]
        for t in types:
            embedding(sents, name.format(split), t, False)
Example #18
def main(args):
    train_dataset, train_dataloader, validate_dataset, validate_dataloader, test_dataset, test_dataloader = baseline.load_datasets_and_dataloaders()

    embedding_matrix = util.embedding(train_dataset.text_vocab, util.config["glove_file_path"])
    use_freeze = util.config["glove_file_path"] is not None
    embedding = torch.nn.Embedding.from_pretrained(embedding_matrix, padding_idx=0, freeze=use_freeze)
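Rough illustration (hypothetical tensors, not from the original code): an nn.Embedding layer built with from_pretrained maps padded index tensors to dense vectors, so downstream models receive (batch, seq_len, dim) inputs:

import torch

# hypothetical 5-word vocabulary with 300-d vectors; row 0 is the padding vector
embedding_matrix = torch.randn(5, 300)
emb = torch.nn.Embedding.from_pretrained(embedding_matrix, padding_idx=0, freeze=True)

token_ids = torch.tensor([[1, 4, 2, 0, 0],   # two padded sequences
                          [3, 1, 0, 0, 0]])
vectors = emb(token_ids)
print(vectors.shape)  # torch.Size([2, 5, 300])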
Example #19
def best_cell_and_baseline_with_and_without_pretrained(train_dataset, train_dataloader, validate_dataloader,
                                                        test_dataloader, args):
    configs = []

    RNN_config = {}
    RNN_config["model"] = "LSTM"
    RNN_config["hidden_size"] = 30
    RNN_config["num_layers"] = 3
    RNN_config["dropout"] = 0.9
    RNN_config["bidirectional"] = True
    RNN_config["fc1_width"] = "//"
    RNN_config["fc2_width"] = "//"

    baseline_config = {}
    baseline_config["model"] = "Baseline"
    baseline_config["hidden_size"] = "//"
    baseline_config["num_layers"] = "//"
    baseline_config["dropout"] = "//"
    baseline_config["bidirectional"] = "//"
    baseline_config["fc1_width"] = 150
    baseline_config["fc2_width"] = 150

    initial_config = {}
    initial_config["clip"] = args.clip
    initial_config["epochs"] = args.epochs
    initial_config["input_width"] = 300
    initial_config["output_width"] = 1

    lstm = RNN.RecurrentModel(RNN_config["model"], initial_config["input_width"], RNN_config["hidden_size"],
                              initial_config["output_width"], RNN_config["num_layers"],
                              RNN_config["bidirectional"], RNN_config["dropout"])

    base = BaselineModel(initial_config["input_width"], baseline_config["fc1_width"], baseline_config["fc2_width"],
                         initial_config["output_width"])
    models = [base, lstm]

    criterion = nn.BCEWithLogitsLoss()
    use_embeddings = [False, True]
    for use_embedding in use_embeddings:
        if use_embedding:
            file_path = util.config["glove_file_path"]
        else:
            file_path = None
        embedding_matrix = util.embedding(train_dataset.text_vocab, file_path)
        use_freeze = file_path is not None
        embedding = torch.nn.Embedding.from_pretrained(embedding_matrix, padding_idx=0, freeze=use_freeze)

        for model in models:
            start = time.time()
            config = {}

            if isinstance(model, RNN.RecurrentModel):
                config.update(RNN_config)
                train = RNN.train
                evaluate = RNN.evaluate
            else:
                config.update(baseline_config)
                train = baseline.train
                evaluate = baseline.evaluate

            config.update(initial_config)
            config["pretrained"] = use_embedding

            print(config)

            optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

            for epoch in range(args.epochs):
                print(f'----------------------------\nEpoch: {epoch}')
                train(model, train_dataloader, optimizer, criterion, embedding, args.clip)
                evaluate(model, validate_dataloader, criterion, embedding)
            accuracy, f1, confusion_matrix = evaluate(model, test_dataloader, criterion, embedding)
            config["accuracy"] = accuracy.item()
            config["f1"] = f1.item()
            config["TP"] = confusion_matrix[0, 0].item()
            config["FP"] = confusion_matrix[0, 1].item()
            config["FN"] = confusion_matrix[1, 0].item()
            config["TN"] = confusion_matrix[1, 1].item()

            end = time.time()
            config["time"] = end - start
            configs.append(config)

    print_to_file("4a_pretrained.xls", "Best RNN and baseline", configs)