Example #1
import codecs
import os
from six.moves import cPickle  # assumed; `import pickle as cPickle` is equivalent on Python 3

def get_validate_phrases(args):
    pairs = []
    phrases = []
    for filename in ["valid.txt"]:
        with codecs.open(os.path.join(args.data_dir, filename),
                         encoding="utf-8") as f:
            f.readline()
            for line in f:
                parts = line.strip().split("\t")
                pair = {
                    "text_1": parts[3],
                    "text_2": parts[4],
                    "decision": float(parts[0])
                }
                pairs.append(pair)
    true = [x["decision"] for x in pairs]
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'rb') as f:
        chars, _ = cPickle.load(f)

    for pair in pairs:
        phrases.append(noise_generator(pair["text_1"], args.noise_level,
                                       chars))
        phrases.append(noise_generator(pair["text_2"], args.noise_level,
                                       chars))

    return phrases, true
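The `noise_generator` helper that the text examples on this page rely on is not reproduced here. Judging from its call signature `(text, noise_level, chars)`, it corrupts a phrase at the character level using the vocabulary loaded from chars_vocab.pkl. A minimal sketch of that assumed behaviour (the corruption strategy in the original repo may differ):

import random

def noise_generator(text, noise_level, chars):
    # Assumed behaviour: each character is replaced, with probability
    # noise_level, by a random character drawn from the model vocabulary.
    noisy = []
    for ch in text:
        if random.random() < noise_level:
            noisy.append(random.choice(chars))
        else:
            noisy.append(ch)
    return "".join(noisy)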
Example #2
def get_validate_entailment(args):
    pairs = []
    phrases = []
    import pandas as pd
    valid_path = os.path.join(args.data_dir, "valid.txt")
    if valid_path.__contains__("quora"):
        full_df = pd.read_csv(valid_path, sep='\t')[:300]
        decision = "duplicate"
    else:
        decision = "gold_label"
        full_df = pd.read_csv(valid_path)
    for index, row in full_df.iterrows():
        pair = {
            "text_1": row['sentence1'],
            "text_2": row["sentence2"],
            "decision": int(row[decision])
        }  # list(filter(lambda x: x.isdigit(), row["gold_label"]))[0]
        pairs.append(pair)

    true = [x["decision"] for x in pairs]
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'rb') as f:
        chars, _ = cPickle.load(f)

    for pair in pairs:
        phrases.append(noise_generator(pair["text_1"], args.noise_level,
                                       chars))
        phrases.append(noise_generator(pair["text_2"], args.noise_level,
                                       chars))

    return phrases, true
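Most of the scoring functions below obtain phrase embeddings through `sample_multi` from the repo's `sample` module, whose implementation is not shown on this page. The interface implied by the calls (a 2-D array that `np.vsplit` divides into one vector per input phrase, presumably produced by the trained character-level model restored from `save_dir`) can be stubbed like this, with the embedding size chosen arbitrarily for illustration:

import numpy as np

def sample_multi(save_dir, phrases, model_type):
    # Stub of the assumed interface only: the real function encodes every
    # phrase with the trained model and must return an array with one row per
    # phrase, so that np.vsplit(result, len(phrases)) yields one embedding
    # per phrase.
    embedding_dim = 128  # hypothetical size
    return np.zeros((len(phrases), embedding_dim))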
def svm_robust_score(args, data, labels):
    idx_for_split = int(0.2 * len(data))
    phrases = []
    pred = []
    for index, row in data.iterrows():
        phrases.append(
            noise_generator(row["sentence1"], args.noise_level, chars))
        phrases.append(
            noise_generator(row["sentence2"], args.noise_level, chars))
    from sample import sample_multi
    results = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, phrases, args.model_type),
                  len(phrases)))
    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
            print(i)
        pred.append(1 - cosine(v1, v2))
        if math.isnan(pred[-1]):
            pred[-1] = 0.5
    pr = pd.DataFrame(pred)
    train = pr.iloc[idx_for_split:]
    test = pr.iloc[:idx_for_split]
    train_label = labels[idx_for_split:]
    test_label = labels[:idx_for_split]
    roc_auc = linear_svm(train, test, train_label, test_label)

    with open("results_entail_" + args.model_type + ".txt", "at") as f_out:
        # f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, mean_squared_error(true, pred)))
        f_out.write(args.mode + ",%.2f,%.3f\n" % (args.noise_level, roc_auc))
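`linear_svm` is another project helper that is not reproduced here. Judging from its arguments and the `roc_auc` value it returns, it fits a linear SVM on the cosine-similarity feature and scores the held-out split with ROC-AUC. A hedged scikit-learn sketch of that assumed behaviour:

from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

def linear_svm(train, test, train_label, test_label):
    # Assumed implementation: linear-kernel SVM with probability estimates,
    # evaluated by ROC-AUC on the held-out split.
    clf = SVC(kernel="linear", probability=True)
    clf.fit(train, train_label)
    y_proba = clf.predict_proba(test)[:, 1]
    return roc_auc_score(test_label, y_proba)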
def get_robust_score(args, pairs, true):
    if "robust" in args.mode:
        pred = []
        phrases = []
        for index, row in pairs.iterrows():
            phrases.append(
                noise_generator(row["sentence1"], args.noise_level, chars))
            phrases.append(
                noise_generator(row["sentence2"], args.noise_level, chars))
        from sample import sample_multi
        results = np.vsplit(
            sample_multi(args.save_dir, phrases, args.model_type),
            len(phrases))
        for i in range(0, len(results), 2):
            v1 = results[i]
            v2 = results[i + 1]
            if (v1 == np.zeros_like(v1)).all() or (v2
                                                   == np.zeros_like(v2)).all():
                print(i)
            pred.append(1 - cosine(v1, v2))
            if math.isnan(pred[-1]):
                pred[-1] = 0.5
        with open("results_entail" + args.model_type + ".txt", "at") as f_out:
            # f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, mean_squared_error(true, pred)))
            f_out.write(args.mode + ",%.2f,%.3f\n" %
                        (args.noise_level, roc_auc_score(true, pred)))
def svm_robust_score(args, data, labels):
    idx_for_split = int(0.2 * len(data))
    phrases = []
    pred = []
    for index, row in data.iterrows():
        phrases.append(noise_generator(row["sentence1"], args.noise_level, chars))
        phrases.append(noise_generator(row["sentence2"], args.noise_level, chars))
    from sample import sample_multi
    results = np.squeeze(np.vsplit(sample_multi(args.save_dir, phrases, args.model_type), len(phrases)))
    #pairs_vectors = zip(results[0::2], results[1::2])
    # df = pd.DataFrame(columns=['cosine', 'canberra', 'cityblock', 'euclidean', 'minkowski', 'braycurtis',"skew_q1","skew_q2"\
    #                            "kur_q1","kur_q2", "skew_diff", "kur_diff"])
    # df['cosine'] = [cosine(x, y) for (x, y) in pairs_vectors]
    # print(len(df))
    # df['canberra'] = [canberra(x, y) for (x, y) in zip(results[0::2], results[1::2])]
    # df['cityblock'] = [cityblock(x, y) for (x, y) in zip(results[0::2], results[1::2])]
    # df['euclidean'] = [euclidean(x, y) for (x, y) in zip(results[0::2], results[1::2])]
    # df['minkowski'] = [minkowski(x, y, 3) for (x, y) in zip(results[0::2], results[1::2])]
    # df['braycurtis'] = [braycurtis(x, y) for (x, y) in zip(results[0::2], results[1::2])]
    # question1_vec = results[0::2]
    # question2_vec = results[1::2]
    # data['skew_q1'] = [skew(x) for x in question1_vec]
    # data['skew_q2'] = [skew(x) for x in question2_vec]
    # data['kur_q1'] = [kurtosis(x) for x in question1_vec]
    # data['kur_q2'] = [kurtosis(x) for x in question2_vec]
    #
    # data['skew_diff'] = np.abs(data['skew_q1'] - data['skew_q2'])
    # data['kur_diff'] = np.abs(data['kur_q1'] - data['kur_q2'])

    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
            print(i)
        pred.append(1 - cosine(v1, v2))
        if math.isnan(pred[-1]):
            pred[-1] = 0.5
    # pr = pd.DataFrame(pred)
    # train = df.iloc[idx_for_split:]
    # test = df.iloc[:idx_for_split]
    # train_label = labels[idx_for_split:]
    # test_label = labels[:idx_for_split]
    roc_auc = roc_auc_score(labels, pred)
    # clf = catboost.CatBoostClassifier(depth=6, iterations=5000, learning_rate=0.1, thread_count=16)
    # clf.fit(train, train_label)
    # y_proba = clf.predict_proba(test)[:, 1]
    # roc_auc = roc_auc_score(test_label, y_proba)
    with open("results_quora" + args.model_type + ".txt", "at") as f_out:
        # f_out.write("robust,%.2f,%.3f\n" % (args.noise_level, mean_squared_error(true, pred)))
        f_out.write(args.mode + ",%.2f,%.3f\n" % (args.noise_level, roc_auc))
def generator_loss(generator, discriminator, criterion, num_images, z_dim):
    noise = noise_generator(num_images, z_dim)
    gen_out = generator(noise)
    # discriminator output on fake images generated by generator
    gen_fake_out = discriminator(gen_out)
    # loss on fake images: the generator wants the discriminator to treat these images as real (1)
    gen_fake_loss = criterion(gen_fake_out, torch.ones_like(gen_fake_out))

    return gen_fake_loss
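In the GAN examples on this page (generator_loss above, discriminator_loss and train below), `noise_generator` has a different signature, `(num_images, z_dim)`, and is assumed to sample the latent vectors fed to the generator. A minimal sketch under that assumption:

import torch

def noise_generator(num_images, z_dim, device="cpu"):
    # Assumed behaviour: draw one z_dim-dimensional standard-normal latent
    # vector per image to generate.
    return torch.randn(num_images, z_dim, device=device)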
def get_w2v_results(args, pairs, true):
    pred = []
    w2v = Word2Vec.load(args.word2vec_model)

    def get_mean_vec(phrase):
        tokens = word_tokenize(phrase)
        vectors = [np.zeros((w2v.vector_size,))]
        for token in tokens:
            if token in w2v:
                vector = w2v[token]
                vectors.append(vector)
        return np.mean(vectors, axis=0)

    for index, pair in pairs.iterrows():
        v1 = get_mean_vec(noise_generator(pair["sentence1"], args.noise_level, chars))
        v2 = get_mean_vec(noise_generator(pair["sentence2"], args.noise_level, chars))
        pred.append(1 - cosine(v1, v2))
    with open("results" + args.mode + ".txt", "at") as f_out:
        # f_out.write("word2vec,%.2f,%.3f\n" % (args.noise_level, mean_squared_error(true, pred)))
        f_out.write("word2vec,%.2f,%.3f\n" % (args.noise_level, roc_auc_score(true, pred)))
def discriminator_loss(generator, discriminator, criterion, real, num_images,
                       z_dim):
    noise = noise_generator(num_images, z_dim)
    gen_out = generator(noise)
    # discriminator output on fake images generated by generator
    disc_fake_out = discriminator(gen_out.detach())
    # loss on fake images: the discriminator wants these images to be classified as fake (0)
    disc_fake_loss = criterion(disc_fake_out, torch.zeros_like(disc_fake_out))

    # discriminator output on real images
    disc_real_out = discriminator(real)
    # loss on real images: the discriminator wants these images to be classified as real (1)
    disc_real_loss = criterion(disc_real_out, torch.ones_like(disc_real_out))

    # average of the fake and real losses
    disc_loss = (disc_real_loss + disc_fake_loss) / 2

    return disc_loss
if "word2vec" in args.mode:
    pred = []
    w2v = Word2Vec.load_word2vec_format(args.word2vec_model, binary=True)

    def get_mean_vec(phrase):
        tokens = word_tokenize(phrase)
        vectors = [np.zeros((w2v.vector_size,))]
        for token in tokens:
            if token in w2v:
                vector = w2v[token]
                vectors.append(vector)
        return np.mean(vectors, axis=0)

    for pair in tqdm(pairs):
        v1 = get_mean_vec(noise_generator(pair["text_1"], args.noise_level, chars))
        v2 = get_mean_vec(noise_generator(pair["text_2"], args.noise_level, chars))
        pred.append(1 - cosine(v1, v2))
    with open("results_w2v_MRPC.txt", "at") as f_out:
        f_out.write("word2vec,%.2f,%.3f\n" % (args.noise_level, roc_auc_score(true, pred)))

if "robust" in args.mode:
    pred = []
    phrases = []
    for pair in pairs:
        phrases.append(noise_generator(pair["text_1"], args.noise_level, chars))
        phrases.append(noise_generator(pair["text_2"], args.noise_level, chars))
    from sample import sample_multi
    results = np.vsplit(sample_multi(args.save_dir, phrases, args.model_type), len(phrases))
    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
            print(i)
        pred.append(1 - cosine(v1, v2))
        if math.isnan(pred[-1]):
            pred[-1] = 0.5
def train(n_epochs=200, batch_size=128, lr=0.00001, z_dim=64, hidden_dim=128):
    cur_step = 0
    display_step = 500
    criterion = nn.BCEWithLogitsLoss()
    mean_discriminator_loss = 0
    mean_generator_loss = 0
    # generator model with optimizer
    generator = Generator(z_dim=z_dim, img_dim=784,
                          hidden_dim=hidden_dim).to(device)
    generator_opt = torch.optim.Adam(generator.parameters(), lr=lr)

    # discriminator model with optimizer
    discriminator = Discriminator(img_dim=784,
                                  hidden_dim=hidden_dim).to(device)
    discriminator_opt = torch.optim.Adam(discriminator.parameters(), lr=lr)

    dataloader = get_dataloader(batch_size)

    for epoch in range(n_epochs):
        for real, _ in dataloader:
            cur_batch_size = len(real)

            # Flatten the batch of real images from the dataset
            real = real.view(cur_batch_size, -1).to(device)

            # Zero out the gradients before backpropagation
            discriminator_opt.zero_grad()

            # Calculate discriminator loss
            disc_loss = discriminator_loss(generator, discriminator, criterion,
                                           real, cur_batch_size, z_dim)

            # Update gradients
            disc_loss.backward(retain_graph=True)

            # Update optimizer
            discriminator_opt.step()

            # Zero out the gradients before backpropagation
            generator_opt.zero_grad()

            # Calculate generator loss
            gen_loss = generator_loss(generator, discriminator, criterion,
                                      cur_batch_size, z_dim)

            # Update gradients
            gen_loss.backward(retain_graph=True)

            # Update optimizer
            generator_opt.step()

            # Keep track of the average discriminator loss
            mean_discriminator_loss += disc_loss.item() / display_step

            # Keep track of the average generator loss
            mean_generator_loss += gen_loss.item() / display_step

            if cur_step % display_step == 0 and cur_step > 0:
                print(
                    f"Epoch {epoch}, step {cur_step}: Generator loss: {mean_generator_loss}, discriminator loss: {mean_discriminator_loss}"
                )
                fake_noise = noise_generator(cur_batch_size, z_dim)
                fake = generator(fake_noise)
                generated_image_grid = show_tensor_images(
                    fake, tensorboard_writer=True)
                tensorboard_writer(generated_image_grid,
                                   epoch,
                                   cur_step,
                                   gen_image=True)
                real_image_grid = show_tensor_images(real,
                                                     tensorboard_writer=True)
                tensorboard_writer(real_image_grid,
                                   epoch,
                                   cur_step,
                                   gen_image=False)
                mean_generator_loss = 0
                mean_discriminator_loss = 0
            cur_step += 1
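A minimal way to launch the training loop above, assuming `device`, `get_dataloader`, `Generator`, `Discriminator`, `show_tensor_images` and `tensorboard_writer` are defined elsewhere in the repo:

if __name__ == "__main__":
    # Arguments mirror the defaults of train(); adjust as needed.
    train(n_epochs=200, batch_size=128, lr=1e-5, z_dim=64, hidden_dim=128)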