def get_validate_phrases(args):
    pairs = []
    phrases = []
    for filename in ["valid.txt"]:
        with codecs.open(os.path.join(args.data_dir, filename), encoding="utf-8") as f:
            f.readline()  # skip the header line
            for line in f:
                parts = line.strip().split("\t")
                pair = {
                    "text_1": parts[3],
                    "text_2": parts[4],
                    "decision": float(parts[0])
                }
                pairs.append(pair)
    true = [x["decision"] for x in pairs]
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'rb') as f:
        chars, _ = cPickle.load(f)
    for pair in pairs:
        phrases.append(noise_generator(pair["text_1"], args.noise_level, chars))
        phrases.append(noise_generator(pair["text_2"], args.noise_level, chars))
    return phrases, true
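# `noise_generator` is used throughout this snippet but not defined in it.
# The sketch below is only an assumption of what it might look like: with
# probability `noise_level`, each character is replaced by a random character
# drawn from the model vocabulary `chars` (the name `noise_generator_sketch`
# is hypothetical).
import random

def noise_generator_sketch(text, noise_level, chars):
    noisy = []
    for ch in text:
        if random.random() < noise_level:
            noisy.append(random.choice(chars))  # corrupt this character
        else:
            noisy.append(ch)                    # keep it unchanged
    return "".join(noisy)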
def get_validate_entailment(args):
    pairs = []
    phrases = []
    import pandas as pd
    valid_path = os.path.join(args.data_dir, "valid.txt")
    if "quora" in valid_path:
        full_df = pd.read_csv(valid_path, sep='\t')[:300]
        decision = "duplicate"
    else:
        decision = "gold_label"
        full_df = pd.read_csv(valid_path)
    for index, row in full_df.iterrows():
        pair = {
            "text_1": row["sentence1"],
            "text_2": row["sentence2"],
            "decision": int(row[decision])
        }
        pairs.append(pair)
    true = [x["decision"] for x in pairs]
    with open(os.path.join(args.save_dir, 'chars_vocab.pkl'), 'rb') as f:
        chars, _ = cPickle.load(f)
    for pair in pairs:
        phrases.append(noise_generator(pair["text_1"], args.noise_level, chars))
        phrases.append(noise_generator(pair["text_2"], args.noise_level, chars))
    return phrases, true
def svm_robust_score(args, data, labels):
    idx_for_split = int(0.2 * len(data))
    phrases = []
    pred = []
    for index, row in data.iterrows():
        phrases.append(noise_generator(row["sentence1"], args.noise_level, chars))
        phrases.append(noise_generator(row["sentence2"], args.noise_level, chars))
    from sample import sample_multi
    results = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, phrases, args.model_type), len(phrases)))
    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
            print(i)
        pred.append(1 - cosine(v1, v2))
        if math.isnan(pred[-1]):
            pred[-1] = 0.5  # cosine is undefined for a zero vector: fall back to 0.5
    pr = pd.DataFrame(pred)
    train = pr.iloc[idx_for_split:]
    test = pr.iloc[:idx_for_split]
    train_label = labels[idx_for_split:]
    test_label = labels[:idx_for_split]
    roc_auc = linear_svm(train, test, train_label, test_label)
    with open("results_entail_" + args.model_type + ".txt", "at") as f_out:
        f_out.write(args.mode + ",%.2f,%.3f\n" % (args.noise_level, roc_auc))
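# `linear_svm` is not defined in this snippet either. A minimal sketch under
# the assumption that it fits a linear SVM on the training features and
# returns the ROC AUC of its decision scores on the held-out split
# (`linear_svm_sketch` is a hypothetical name):
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC

def linear_svm_sketch(train, test, train_label, test_label):
    clf = LinearSVC()
    clf.fit(train, train_label)            # works for a single-feature input as well
    scores = clf.decision_function(test)   # signed distance to the separating hyperplane
    return roc_auc_score(test_label, scores)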
def get_robust_score(args, pairs, true):
    if "robust" in args.mode:
        pred = []
        phrases = []
        for index, row in pairs.iterrows():
            phrases.append(noise_generator(row["sentence1"], args.noise_level, chars))
            phrases.append(noise_generator(row["sentence2"], args.noise_level, chars))
        from sample import sample_multi
        # squeeze each (1, dim) slice to a 1-D vector, as in svm_robust_score
        results = np.squeeze(
            np.vsplit(sample_multi(args.save_dir, phrases, args.model_type), len(phrases)))
        for i in range(0, len(results), 2):
            v1 = results[i]
            v2 = results[i + 1]
            if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
                print(i)
            pred.append(1 - cosine(v1, v2))
            if math.isnan(pred[-1]):
                pred[-1] = 0.5
        with open("results_entail" + args.model_type + ".txt", "at") as f_out:
            f_out.write(args.mode + ",%.2f,%.3f\n" % (args.noise_level, roc_auc_score(true, pred)))
def svm_robust_score(args, data, labels):
    # NOTE: this redefinition shadows the svm_robust_score above; it scores the
    # cosine similarities directly with ROC AUC and writes to the Quora results file.
    idx_for_split = int(0.2 * len(data))
    phrases = []
    pred = []
    for index, row in data.iterrows():
        phrases.append(noise_generator(row["sentence1"], args.noise_level, chars))
        phrases.append(noise_generator(row["sentence2"], args.noise_level, chars))
    from sample import sample_multi
    results = np.squeeze(
        np.vsplit(sample_multi(args.save_dir, phrases, args.model_type), len(phrases)))
    # An alternative, unused here, builds pairwise distance features (cosine,
    # canberra, cityblock, euclidean, minkowski, braycurtis) plus skew/kurtosis
    # of each vector and trains a CatBoost classifier on the train/test split
    # instead of scoring the raw cosine similarity (see the sketch below).
    for i in range(0, len(results), 2):
        v1 = results[i]
        v2 = results[i + 1]
        if (v1 == np.zeros_like(v1)).all() or (v2 == np.zeros_like(v2)).all():
            print(i)
        pred.append(1 - cosine(v1, v2))
        if math.isnan(pred[-1]):
            pred[-1] = 0.5
    roc_auc = roc_auc_score(labels, pred)
    with open("results_quora" + args.model_type + ".txt", "at") as f_out:
        f_out.write(args.mode + ",%.2f,%.3f\n" % (args.noise_level, roc_auc))
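# A runnable sketch of the feature-based alternative mentioned in the comment
# above. The helper name and exact feature set are assumptions reconstructed
# from that description, not part of the original pipeline.
import catboost
import pandas as pd
from scipy.spatial.distance import (braycurtis, canberra, cityblock, cosine,
                                    euclidean, minkowski)
from scipy.stats import kurtosis, skew
from sklearn.metrics import roc_auc_score

def catboost_robust_score_sketch(results, labels, idx_for_split):
    # results holds the sentence vectors in alternating order: q1, q2, q1, q2, ...
    q1, q2 = results[0::2], results[1::2]
    df = pd.DataFrame()
    df['cosine'] = [cosine(x, y) for x, y in zip(q1, q2)]
    df['canberra'] = [canberra(x, y) for x, y in zip(q1, q2)]
    df['cityblock'] = [cityblock(x, y) for x, y in zip(q1, q2)]
    df['euclidean'] = [euclidean(x, y) for x, y in zip(q1, q2)]
    df['minkowski'] = [minkowski(x, y, 3) for x, y in zip(q1, q2)]
    df['braycurtis'] = [braycurtis(x, y) for x, y in zip(q1, q2)]
    df['skew_q1'] = [skew(x) for x in q1]
    df['skew_q2'] = [skew(x) for x in q2]
    df['kur_q1'] = [kurtosis(x) for x in q1]
    df['kur_q2'] = [kurtosis(x) for x in q2]
    # same split convention as svm_robust_score: the first chunk is held out for testing
    train, test = df.iloc[idx_for_split:], df.iloc[:idx_for_split]
    train_label, test_label = labels[idx_for_split:], labels[:idx_for_split]
    clf = catboost.CatBoostClassifier(depth=6, iterations=5000, learning_rate=0.1, thread_count=16)
    clf.fit(train, train_label)
    y_proba = clf.predict_proba(test)[:, 1]
    return roc_auc_score(test_label, y_proba)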
def generator_loss(generator, discriminator, criterion, num_images, z_dim):
    noise = noise_generator(num_images, z_dim)
    gen_out = generator(noise)
    # discriminator output on the fake images produced by the generator
    gen_fake_out = discriminator(gen_out)
    # the generator wants the discriminator to label these fakes as real (1)
    gen_fake_loss = criterion(gen_fake_out, torch.ones_like(gen_fake_out))
    return gen_fake_loss
def get_w2v_results(args, pairs, true):
    pred = []
    w2v = Word2Vec.load(args.word2vec_model)

    def get_mean_vec(phrase):
        tokens = word_tokenize(phrase)
        vectors = [np.zeros((w2v.vector_size,))]
        for token in tokens:
            if token in w2v:
                vectors.append(w2v[token])
        return np.mean(vectors, axis=0)

    for index, pair in pairs.iterrows():
        v1 = get_mean_vec(noise_generator(pair["sentence1"], args.noise_level, chars))
        v2 = get_mean_vec(noise_generator(pair["sentence2"], args.noise_level, chars))
        pred.append(1 - cosine(v1, v2))
    with open("results" + args.mode + ".txt", "at") as f_out:
        f_out.write("word2vec,%.2f,%.3f\n" % (args.noise_level, roc_auc_score(true, pred)))
def discriminator_loss(generator, discriminator, criterion, real, num_images, z_dim):
    noise = noise_generator(num_images, z_dim)
    gen_out = generator(noise)
    # discriminator output on the fake images produced by the generator
    disc_fake_out = discriminator(gen_out.detach())
    # the discriminator wants these images to be labelled fake (0)
    disc_fake_loss = criterion(disc_fake_out, torch.zeros_like(disc_fake_out))
    # discriminator output on real images
    disc_real_out = discriminator(real)
    # the discriminator wants these images to be labelled real (1)
    disc_real_loss = criterion(disc_real_out, torch.ones_like(disc_real_out))
    # average of the two losses
    disc_loss = (disc_real_loss + disc_fake_loss) / 2
    return disc_loss
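# The GAN-side `noise_generator(num_images, z_dim)` is likewise not defined in
# this snippet. A minimal sketch, assuming it samples standard-normal latent
# vectors (the name `gan_noise_sketch` and the `device` argument are assumptions):
import torch

def gan_noise_sketch(num_images, z_dim, device="cpu"):
    # one z_dim-dimensional latent vector per requested image
    return torch.randn(num_images, z_dim, device=device)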
if "word2vec" in args.mode: pred = [] w2v = Word2Vec.load_word2vec_format(args.word2vec_model, binary=True) def get_mean_vec(phrase): tokens = word_tokenize(phrase) vectors = [np.zeros((w2v.vector_size,))] for token in tokens: if token in w2v: vector = w2v[token] vectors.append(vector) return np.mean(vectors, axis=0) for pair in tqdm(pairs): v1 = get_mean_vec(noise_generator(pair["text_1"], args.noise_level, chars)) v2 = get_mean_vec(noise_generator(pair["text_2"], args.noise_level, chars)) pred.append(1 - cosine(v1, v2)) with open("results_w2v_MRPC.txt", "at") as f_out: f_out.write("word2vec,%.2f,%.3f\n" % (args.noise_level, roc_auc_score(true, pred))) if "robust" in args.mode: pred = [] phrases = [] for pair in pairs: phrases.append(noise_generator(pair["text_1"], args.noise_level, chars)) phrases.append(noise_generator(pair["text_2"], args.noise_level, chars)) from sample import sample_multi results = np.vsplit(sample_multi(args.save_dir, phrases, args.model_type), len(phrases)) for i in range(0, len(results), 2): v1 = results[i]
def train(n_epochs=200, batch_size=128, lr=0.00001, z_dim=64, hidden_dim=128):
    cur_step = 0
    display_step = 500
    criterion = nn.BCEWithLogitsLoss()
    mean_discriminator_loss = 0
    mean_generator_loss = 0
    # generator model with optimizer
    generator = Generator(z_dim=z_dim, img_dim=784, hidden_dim=hidden_dim).to(device)
    generator_opt = torch.optim.Adam(generator.parameters(), lr=lr)
    # discriminator model with optimizer
    discriminator = Discriminator(img_dim=784, hidden_dim=hidden_dim).to(device)
    discriminator_opt = torch.optim.Adam(discriminator.parameters(), lr=lr)
    dataloader = get_dataloader(batch_size)
    for epoch in range(n_epochs):
        for real, _ in dataloader:
            cur_batch_size = len(real)
            # Flatten the batch of real images from the dataset
            real = real.view(cur_batch_size, -1).to(device)

            # Zero out the gradients before backpropagation
            discriminator_opt.zero_grad()
            # Calculate discriminator loss
            disc_loss = discriminator_loss(generator, discriminator, criterion, real,
                                           cur_batch_size, z_dim)
            # Update gradients
            disc_loss.backward(retain_graph=True)
            # Update optimizer
            discriminator_opt.step()

            # Zero out the gradients before backpropagation
            generator_opt.zero_grad()
            # Calculate generator loss
            gen_loss = generator_loss(generator, discriminator, criterion, cur_batch_size, z_dim)
            # Update gradients
            gen_loss.backward(retain_graph=True)
            # Update optimizer
            generator_opt.step()

            # Keep track of the average discriminator loss
            mean_discriminator_loss += disc_loss.item() / display_step
            # Keep track of the average generator loss
            mean_generator_loss += gen_loss.item() / display_step

            if cur_step % display_step == 0 and cur_step > 0:
                print(
                    f"Epoch {epoch}, step {cur_step}: Generator loss: {mean_generator_loss}, "
                    f"discriminator loss: {mean_discriminator_loss}"
                )
                fake_noise = noise_generator(cur_batch_size, z_dim)
                fake = generator(fake_noise)
                generated_image_grid = show_tensor_images(fake, tensorboard_writer=True)
                tensorboard_writer(generated_image_grid, epoch, cur_step, gen_image=True)
                real_image_grid = show_tensor_images(real, tensorboard_writer=True)
                tensorboard_writer(real_image_grid, epoch, cur_step, gen_image=False)
                mean_generator_loss = 0
                mean_discriminator_loss = 0
            cur_step += 1
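# Example entry point for the training loop above; the arguments simply repeat
# the defaults of train(), and running it assumes Generator, Discriminator,
# get_dataloader, show_tensor_images and tensorboard_writer are importable.
if __name__ == "__main__":
    train(n_epochs=200, batch_size=128, lr=0.00001, z_dim=64, hidden_dim=128)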