def main(args): data_pth = "results/%s" % args.data_name train_pth = os.path.join(data_pth, ("train_identical_{}_{}.txt").format(str(args.confidence+10),args.style)) #dev_pth = os.path.join(data_pth, "dev_identical_80_%s.txt" % args.style) test_pth = os.path.join(data_pth, ("test_identical_{}_{}.txt").format(str(args.confidence+10),args.style)) train_data = MonoTextData(train_pth, True, vocab=100000) #random.shuffle(train_data.data) vocab = train_data.vocab #dev_data = MonoTextData(dev_pth, True, vocab=vocab) #random.shuffle(dev_data.data) test_data = MonoTextData(test_pth, True, vocab=vocab) path = "checkpoint/{}-identical-{}-{}-classifier.pt".format(str(args.confidence),args.data_name,args.style) #path = "checkpoint/%s-classifier.pt" % args.data_name device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #train_batch, train_label = train_data.create_data_batch_labels(64, device, batch_first=True) #dev_batch, dev_label = dev_data.create_data_batch_labels(64, device, batch_first=True) test_batch, test_label = test_data.create_data_batch_labels(64, device, batch_first=True) #nbatch = len(train_batch) #best_acc = 0.0 #step = 0 checkpoint = torch.load(path) model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1,2,3,4,5], 500, 0.5).to(device) model.load_state_dict(checkpoint) model.eval() with torch.no_grad(): acc = evaluate(model, test_batch, test_label) print('Test Acc: %.2f' % acc)
def main(args):
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_data.txt")
    train_data = MonoTextData(train_data_pth, True)

    vocab = train_data.vocab
    print('Vocabulary size: %d' % len(vocab))

    dev_data_pth = os.path.join(data_pth, "dev_data.txt")
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    test_data_pth = os.path.join(data_pth, "test_data.txt")
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path = '{}-{}'.format(args.save, args.data_name)
    save_path = os.path.join(save_path, time.strftime("%Y%m%d-%H%M%S"))
    scripts_to_save = [
        'run.py', 'models/aggressive_vae.py', 'models/vae.py',
        'models/base_network.py', 'config.py']
    logging = create_exp_dir(save_path, scripts_to_save=scripts_to_save,
                             debug=args.debug)

    train = train_data.create_data_batch(args.bsz, device)
    dev = dev_data.create_data_batch(args.bsz, device)
    test = test_data.create_data_batch(args.bsz, device)

    kwargs = {
        "train": train,
        "valid": dev,
        "test": test,
        "bsz": args.bsz,
        "save_path": save_path,
        "logging": logging,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    kwargs = dict(kwargs, **params)

    model = AgressiveVAE(**kwargs)
    try:
        valid_loss = model.fit()
        logging("val loss : {}".format(valid_loss))
    except KeyboardInterrupt:
        logging("Exiting from training early")

    model.load(save_path)
    test_loss = model.evaluate(model.test_data)
    logging("test loss: {}".format(test_loss[0]))
    logging("test recon: {}".format(test_loss[1]))
    logging("test kl: {}".format(test_loss[2]))
    logging("test mi: {}".format(test_loss[3]))
def main(args): data_pth = "data/%s" % args.data_name train_pth = os.path.join(data_pth, "train_data.txt") train_data = MonoTextData(train_pth, True, vocab=100000) vocab = train_data.vocab source_pth = os.path.join(data_pth, "test_data.txt") target_pth = args.target_path eval_data = MonoTextData(target_pth, True, vocab=vocab) source = pd.read_csv(source_pth, names=['label', 'content'], sep='\t') target = pd.read_csv(target_pth, names=['label', 'content'], sep='\t') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Classification Accuracy model = CNNClassifier(len(vocab), 300, [1, 2, 3, 4, 5], 500, 0.5).to(device) model.load_state_dict( torch.load("checkpoint/%s-classifier.pt" % args.data_name)) model.eval() eval_data, eval_label = eval_data.create_data_batch_labels( 64, device, batch_first=True) acc = 100 * evaluate(model, eval_data, eval_label) print("Acc: %.2f" % acc) # BLEU Score total_bleu = 0.0 sources = [] targets = [] for i in range(source.shape[0]): s = source.content[i].split() t = target.content[i].split() sources.append([s]) targets.append(t) total_bleu += compute_bleu(sources, targets)[0] total_bleu *= 100 print("Bleu: %.2f" % total_bleu)
def main(args):
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_data.txt")
    train_data = MonoTextData(train_data_pth, True)
    vocab = train_data.vocab

    dev_data_pth = os.path.join(data_pth, "dev_data.txt")
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    test_data_pth = os.path.join(data_pth, "test_data.txt")
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    kwargs = {
        "train": [1],
        "valid": None,
        "test": None,
        "bsz": 32,
        "save_path": args.load_path,
        "logging": None,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    kwargs = dict(kwargs, **params)

    model = AgressiveVAE(**kwargs)
    model.load(args.load_path)
    model.vae.eval()

    train = train_data.create_data_batch(32, device)
    dev, dev_labels = dev_data.create_data_batch_labels(64, device)
    dev_labels = [x for sublist in dev_labels for x in sublist]

    print("Collecting training distributions...")
    mus, logvars = [], []
    step = 0
    for batch_data in train:
        mu, logvar = model.vae.encoder(batch_data)
        mus.append(mu.detach().cpu())
        logvars.append(logvar.detach().cpu())
        step += 1
        if step % 100 == 0:
            torch.cuda.empty_cache()
    mus = torch.cat(mus, 0)
    logvars = torch.cat(logvars, 0)

    zs = []
    for batch_data in dev:
        z, _ = model.vae.encoder(batch_data)
        zs.append(z)
    zs = torch.cat(zs, 0)
    mu = zs.mean(dim=0, keepdim=True)
    # unnormalized_zs = zs.data.cpu().numpy()
    zs = (zs - mu).data.cpu().numpy()

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    # Scan every latent dimension for the one that best separates the two labels
    # on the dev set (in either direction).
    best_acc = 0.0
    best_idx = -1
    other_idx = 64
    sign = 1
    for i in range(zs.shape[1]):
        correct_num = 0
        for j in range(zs.shape[0]):
            logit = sigmoid(-zs[j, i])
            if np.abs(dev_labels[j] - logit) < 0.5:
                correct_num += 1
        acc = correct_num / zs.shape[0]
        if acc > best_acc:
            best_acc = acc
            best_idx = i
            sign = 1
        if 1 - acc > best_acc:
            best_acc = 1 - acc
            best_idx = i
            sign = 0
    print(best_acc, best_idx)

    v = mus[:, best_idx]
    mu = v.mean()
    std = v.std()
    if args.type == 3:
        max_v = max(v)
        min_v = min(v)
    else:
        max_v = mu + args.type * std
        min_v = mu - args.type * std

    sep_id = -1
    for idx, x in enumerate(test_data.labels):
        if x == 1:
            sep_id = idx
            break

    bsz = 64
    ori_logps = []
    tra_logps = []
    with open(os.path.join(args.load_path, 'generated_text_%d.txt' % args.type), "w") as f:
        idx = 0
        step = 0
        n_samples = len(test_data.labels)
        while idx < n_samples:
            label = test_data.labels[idx]
            _idx = idx + bsz if label else min(idx + bsz, sep_id)
            _idx = min(_idx, n_samples)
            text, _ = test_data._to_tensor(
                test_data.data[idx:_idx], batch_first=False, device=device)
            z, _ = model.vae.encoder(text)
            ori_z = z.clone()
            tmp = max_v if label == sign else min_v
            if args.type > 0:
                z[:, best_idx] += torch.ones(text.shape[1]).to(device) * tmp
            texts = model.vae.decoder.beam_search_decode(z)
            for text in texts:
                f.write("%d\t%s\n" % (1 - label, " ".join(text[1:-1])))
            for i in range(_idx - idx):
                ori_logps.append(cal_log_density(mus, logvars, ori_z[i:i + 1].cpu()))
                tra_logps.append(cal_log_density(mus, logvars, z[i:i + 1].cpu()))
            idx = _idx
            step += 1
            if step % 100 == 0:
                print(step, idx)

    with open(os.path.join(args.load_path, "nll_%d.txt" % args.type), "w") as f:
        for x, y in zip(ori_logps, tra_logps):
            f.write("%f\t%f\n" % (x, y))
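
# `cal_log_density` is defined elsewhere in the repo. A minimal sketch, under the
# assumption that it scores a latent code z against the aggregate posterior built
# from the collected training (mu, logvar) pairs, i.e. the log of the mean
# diagonal-Gaussian density; the actual helper may differ:
import math

def cal_log_density_sketch(mus, logvars, z):
    # mus, logvars: [N, d] tensors gathered from the training set; z: [1, d]
    var = logvars.exp()
    log_probs = (-0.5 * (((z - mus) ** 2) / var + logvars
                         + math.log(2 * math.pi))).sum(-1)
    # log of the average density over the N training posteriors
    return (torch.logsumexp(log_probs, dim=0) - math.log(mus.shape[0])).item()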
def main(args): print("Entering eval_preds.py...") data_pth = "results/%s" % args.data_name train_pth = os.path.join( data_pth, "_train_whole_data.txt") #Default vocab is taken from train data train_data = MonoTextData(train_pth, False, vocab=100000) vocab = train_data.vocab device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') source_pth = os.path.join( data_pth, args.source_file_name) #Classify the given source file's contents print("Classifying data in ", source_pth) source_data = MonoTextData(source_pth, False, vocab=100000) source_data_vocab = source_data.vocab source_data = source_data.create_data_batch(64, device, batch_first=True) target_pth = "results/%s" % args.data_name target_pth = os.path.join( target_pth, args.target_file_name) #save the generated output into the target file source = pd.read_csv(source_pth, sep="\n", header=None) source.columns = ["content"] #target = pd.read_csv(target_pth, names=['content','sentiment-label','tense-label'], sep='\t') target = pd.DataFrame( columns=['content', 'sentiment-label', 'tense-label']) target.head() # Classification for style in ["tense", "sentiment"]: #model = CNNClassifier(len(vocab), 300, [1,2,3,4,5], 500, 0.5).to(device) print("Classifying ", style) model_path = "checkpoint/{}-{}-classifier.pt".format( args.data_name, style) checkpoint = torch.load(model_path) #model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1,2,3,4,5], 500, 0.5).to(device) print(len(checkpoint['embedding.weight']), len(source_data_vocab)) model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1, 2, 3, 4, 5], 500, 0.5).to(device) model.load_state_dict(checkpoint) #break model.eval() content = [] predictions = [] with torch.no_grad(): print("Number of batches = ", len(source_data)) idx = 0 for batch_data in source_data: print("Evaluating batch ", idx) logits = model(batch_data) probs = torch.sigmoid(logits) y_hat = list((probs > 0.5).long().cpu().numpy()) predictions.extend(y_hat) idx = idx + 1 #break label = "{}-label".format(style) #print("Number of sentences = ",len(content)) print("Length of predictions = ", len(predictions)) #print(predictions) target['content'] = source["content"] # print("Content:") # print(target['content']) target[label] = predictions #print("Predictions:") #print(target[label]) print("No of sentences = ", len(target)) print(target.head()) target.to_csv(target_pth, sep='\t') print("Output written to ", target_pth)
def main(args): print("Entering eval_preds.py...") data_pth = "data/%s" % args.data_name temp = "_train_%s_data.txt" % args.style train_pth = os.path.join(data_pth, temp) #Default vocab is taken from train data train_data = MonoTextData(train_pth, False, vocab=100000) vocab = train_data.vocab device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') source_pth = os.path.join( data_pth, args.source_file_name) #Classify the given source file's contents print("Classifying data in ", source_pth) source_data = MonoTextData(source_pth, True, vocab=100000) source_data_vocab = source_data.vocab source_data = source_data.create_data_batch(64, device, batch_first=True) target_pth = "results/%s" % args.data_name target_pth = os.path.join( target_pth, args.target_file_name) #save the generated output into the target file source = pd.read_csv(source_pth, sep="\t", header=None) source.columns = ["label", "content"] #target = pd.read_csv(target_pth, names=['content','sentiment-label','tense-label'], sep='\t') target = pd.DataFrame( columns=['content', 'sentiment-label', 'tense-label']) target.head() # Classification if args.style == "sentiment": #model = CNNClassifier(len(vocab), 300, [1,2,3,4,5], 500, 0.5).to(device) print("Classifying tense on given sentiment labeled data") model_path = "checkpoint/{}-{}-classifier.pt".format( args.data_name, "tense") checkpoint = torch.load(model_path) #model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1,2,3,4,5], 500, 0.5).to(device) print(len(checkpoint['embedding.weight']), len(source_data_vocab)) model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1, 2, 3, 4, 5], 500, 0.5).to(device) model.load_state_dict(checkpoint) #break model.eval() content = [] predictions = [] with torch.no_grad(): print("Number of batches = ", len(source_data)) idx = 0 for batch_data in source_data: print("Evaluating batch ", idx) logits = model(batch_data) probs = torch.sigmoid(logits) #prob(1) # y_hat = list((probs > 0.5).long().cpu().numpy()) # predictions.extend(y_hat) #retaining probability values itself so that we can threshold later and remove less confident sentences predictions.extend(list(probs.cpu().numpy())) idx = idx + 1 #break label = "{}-label".format("tense") #print("Number of sentences = ",len(content)) print("Length of predictions = ", len(predictions)) #print(predictions) # print("Content:") # print(target['content']) final_content = [] final_sentiment_label = [] final_tense_label = [] i = 0 for pred in predictions: pred_1 = pred #prob(1) 0.3 0.8 pred_0 = 1 - pred_1 #prob(0) 0.7 0.2 if pred_1 >= args.confidence or pred_0 >= args.confidence: #model is 80% confidently predicting at least one label, so retain the sentence if pred_1 >= args.confidence: final_tense_label.append(1) else: final_tense_label.append(0) final_content.append(source["content"].get(i)) final_sentiment_label.append(source["label"].get(i)) i = i + 1 target['content'] = final_content #source["content"] target[label] = final_tense_label #predictions #print("Predictions:") #print(target[label]) target['sentiment-label'] = final_sentiment_label #source["label"] print( "No of sentences, after retaining only 80% confident predictions = ", len(target)) print(target.head()) else: print("Classifying sentiment on tense labeled data") model_path = "checkpoint/{}-{}-classifier.pt".format( args.data_name, "sentiment") checkpoint = torch.load(model_path) #model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1,2,3,4,5], 500, 0.5).to(device) 
print(len(checkpoint['embedding.weight']), len(source_data_vocab)) model = CNNClassifier(len(checkpoint['embedding.weight']), 300, [1, 2, 3, 4, 5], 500, 0.5).to(device) model.load_state_dict(checkpoint) #break model.eval() content = [] predictions = [] with torch.no_grad(): print("Number of batches = ", len(source_data)) idx = 0 for batch_data in source_data: print("Evaluating batch ", idx) logits = model(batch_data) probs = torch.sigmoid(logits) # y_hat = list((probs > 0.5).long().cpu().numpy()) # predictions.extend(y_hat) #retaining probability values itself so that we can threshold later and remove less confident sentences predictions.extend(list(probs.float().cpu().numpy())) idx = idx + 1 #break label = "{}-label".format("sentiment") #print("Number of sentences = ",len(content)) print("Length of predictions = ", len(predictions)) final_content = [] final_sentiment_label = [] final_tense_label = [] i = 0 for pred in predictions: pred_1 = pred #prob(1) 0.3 0.8 pred_0 = 1 - pred_1 #prob(0) 0.7 0.2 if pred_1 >= args.confidence or pred_0 >= args.confidence: #model is 80% confidently predicting at least one label, so retain the sentence if pred_1 >= args.confidence: final_sentiment_label.append(1) else: final_sentiment_label.append(0) final_content.append(source["content"].get(i)) final_tense_label.append(source["label"].get(i)) i = i + 1 #print(predictions) target['content'] = final_content #source["content"] # print("Content:") # print(target['content']) target[label] = final_sentiment_label #predictions #print("Predictions:") #print(target[label]) target['tense-label'] = final_tense_label #source["label"] print( "No of sentences, after retaining only 80% confident predictions = ", len(target)) print(target.head()) target.to_csv(target_pth, sep='\t') print("Output written to ", target_pth)
def main(args):
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name

    train_sentiment_data_pth = os.path.join(data_pth, "train_sentiment_data.txt")
    train_sentiment_feat_pth = os.path.join(data_pth, "train_sentiment_%s.npy" % args.feat)
    train_sentiment_data = MonoTextData(train_sentiment_data_pth, True)
    train_sentiment_feat = np.load(train_sentiment_feat_pth)

    train_tense_data_pth = os.path.join(data_pth, "train_tense_data.txt")
    train_tense_feat_pth = os.path.join(data_pth, "train_tense_%s.npy" % args.feat)
    train_tense_data = MonoTextData(train_tense_data_pth, True)
    train_tense_feat = np.load(train_tense_feat_pth)

    sentiment_vocab = train_sentiment_data.vocab
    print('Sentiment Vocabulary size: %d' % len(sentiment_vocab))
    tense_vocab = train_tense_data.vocab
    print('Tense Vocabulary size: %d' % len(tense_vocab))

    dev_sentiment_data_pth = os.path.join(data_pth, "dev_sentiment_data.txt")
    dev_sentiment_feat_pth = os.path.join(data_pth, "dev_sentiment_%s.npy" % args.feat)
    dev_sentiment_data = MonoTextData(dev_sentiment_data_pth, True, vocab=sentiment_vocab)
    dev_sentiment_feat = np.load(dev_sentiment_feat_pth)

    dev_tense_data_pth = os.path.join(data_pth, "dev_tense_data.txt")
    dev_tense_feat_pth = os.path.join(data_pth, "dev_tense_%s.npy" % args.feat)
    dev_tense_data = MonoTextData(dev_tense_data_pth, True, vocab=tense_vocab)
    dev_tense_feat = np.load(dev_tense_feat_pth)

    test_sentiment_data_pth = os.path.join(data_pth, "test_sentiment_data.txt")
    test_sentiment_feat_pth = os.path.join(data_pth, "test_sentiment_%s.npy" % args.feat)
    test_sentiment_data = MonoTextData(test_sentiment_data_pth, True, vocab=sentiment_vocab)
    test_sentiment_feat = np.load(test_sentiment_feat_pth)

    test_tense_data_pth = os.path.join(data_pth, "test_tense_data.txt")
    test_tense_feat_pth = os.path.join(data_pth, "test_tense_%s.npy" % args.feat)
    test_tense_data = MonoTextData(test_tense_data_pth, True, vocab=tense_vocab)
    test_tense_feat = np.load(test_tense_feat_pth)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path0 = 'sentiment-{}-{}-{}'.format(args.save, args.data_name, args.feat)
    save_path0 = os.path.join(save_path0, time.strftime("%Y%m%d-%H%M%S"))
    save_path1 = 'tense-{}-{}-{}'.format(args.save, args.data_name, args.feat)
    save_path1 = os.path.join(save_path1, time.strftime("%Y%m%d-%H%M%S"))
    scripts_to_save = [
        'run.py', 'models/decomposed_vae.py', 'models/vae.py',
        'models/base_network.py', 'config.py']
    logging0 = create_exp_dir(save_path0, scripts_to_save=scripts_to_save, debug=args.debug)
    logging1 = create_exp_dir(save_path1, scripts_to_save=scripts_to_save, debug=args.debug)

    if args.text_only:
        train_sentiment = train_sentiment_data.create_data_batch(args.bsz, device)
        dev_sentiment = dev_sentiment_data.create_data_batch(args.bsz, device)
        test_sentiment = test_sentiment_data.create_data_batch(args.bsz, device)
        feat_sentiment = train_sentiment

        train_tense = train_tense_data.create_data_batch(args.bsz, device)
        test_tense = test_tense_data.create_data_batch(args.bsz, device)
        feat_tense = train_tense
    else:
        train_sentiment = train_sentiment_data.create_data_batch_feats(
            args.bsz, train_sentiment_feat, device)
        dev_sentiment = dev_sentiment_data.create_data_batch_feats(
            args.bsz, dev_sentiment_feat, device)
        test_sentiment = test_sentiment_data.create_data_batch_feats(
            args.bsz, test_sentiment_feat, device)
        feat_sentiment = train_sentiment_feat

        train_tense = train_tense_data.create_data_batch_feats(
            args.bsz, train_tense_feat, device)
        test_tense = test_tense_data.create_data_batch_feats(
            args.bsz, test_tense_feat, device)
        feat_tense = train_tense_feat

    # VAE training on sentiment data
    # kwargs0 = {
    #     "train": train_sentiment,
    #     "valid": dev_sentiment,
    #     "test": test_sentiment,
    #     "feat": feat_sentiment,
    #     "bsz": args.bsz,
    #     "save_path": save_path0,
    #     "logging": logging0,
    #     "text_only": args.text_only,
    # }
    # params = conf["params"]
    # params["vae_params"]["vocab"] = sentiment_vocab
    # params["vae_params"]["device"] = device
    # params["vae_params"]["text_only"] = args.text_only
    # params["vae_params"]["mlp_ni"] = train_sentiment_feat.shape[1]
    # kwargs0 = dict(kwargs0, **params)
    # sentiment_model = DecomposedVAE(**kwargs0)
    # try:
    #     valid_loss = sentiment_model.fit()
    #     logging0("sentiment val loss : {}".format(valid_loss))
    # except KeyboardInterrupt:
    #     logging0("Exiting from training early")
    # sentiment_model.load(save_path0)
    # test_loss = sentiment_model.evaluate(sentiment_model.test_data, sentiment_model.test_feat)
    # logging0("sentiment test loss: {}".format(test_loss[0]))
    # logging0("sentiment test recon: {}".format(test_loss[1]))
    # logging0("sentiment test kl1: {}".format(test_loss[2]))
    # logging0("sentiment test kl2: {}".format(test_loss[3]))
    # logging0("sentiment test mi1: {}".format(test_loss[4]))
    # logging0("sentiment test mi2: {}".format(test_loss[5]))

    # VAE training on tense data
    kwargs1 = {
        "train": train_tense,
        "valid": test_tense,
        "test": test_tense,
        "feat": feat_tense,
        "bsz": args.bsz,
        "save_path": save_path1,
        "logging": logging1,
        "text_only": args.text_only,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = tense_vocab
    params["vae_params"]["device"] = device
    params["vae_params"]["text_only"] = args.text_only
    params["vae_params"]["mlp_ni"] = train_tense_feat.shape[1]
    kwargs1 = dict(kwargs1, **params)

    tense_model = DecomposedVAE(**kwargs1)
    try:
        valid_loss = tense_model.fit()
        logging1("tense val loss : {}".format(valid_loss))
    except KeyboardInterrupt:
        logging1("Exiting from training early")

    tense_model.load(save_path1)
    test_loss = tense_model.evaluate(tense_model.test_data, tense_model.test_feat)
    logging1("tense test loss: {}".format(test_loss[0]))
    logging1("tense test recon: {}".format(test_loss[1]))
    logging1("tense test kl1: {}".format(test_loss[2]))
    logging1("tense test kl2: {}".format(test_loss[3]))
    logging1("tense test mi1: {}".format(test_loss[4]))
    logging1("tense test mi2: {}".format(test_loss[5]))
def main(args):
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_data.txt")
    train_feat_pth = os.path.join(data_pth, "train_%s.npy" % args.feat)
    train_data = MonoTextData(train_data_pth, True)
    train_feat = np.load(train_feat_pth)
    vocab = train_data.vocab

    dev_data_pth = os.path.join(data_pth, "dev_data.txt")
    dev_feat_pth = os.path.join(data_pth, "dev_%s.npy" % args.feat)
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    dev_feat = np.load(dev_feat_pth)

    test_data_pth = os.path.join(data_pth, "test_data.txt")
    test_feat_pth = os.path.join(data_pth, "test_%s.npy" % args.feat)
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)
    test_feat = np.load(test_feat_pth)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    kwargs = {
        "train": ([1], None),
        "valid": (None, None),
        "test": (None, None),
        "feat": None,
        "bsz": 32,
        "save_path": args.load_path,
        "logging": None,
        "text_only": args.text_only,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    params["vae_params"]["text_only"] = args.text_only
    params["vae_params"]["mlp_ni"] = dev_feat.shape[1]
    kwargs = dict(kwargs, **params)

    model = DecomposedVAE(**kwargs)
    model.load(args.load_path)
    model.vae.eval()

    train_data, train_feat = train_data.create_data_batch_feats(32, train_feat, device)
    print("Collecting training distributions...")
    mus, logvars = [], []
    step = 0
    for batch_data, batch_feat in zip(train_data, train_feat):
        mu1, logvar1 = model.vae.lstm_encoder(batch_data)
        mu2, logvar2 = model.vae.mlp_encoder(batch_feat)
        r, _ = model.vae.mlp_encoder(batch_feat, True)
        p = model.vae.get_var_prob(r)
        mu = torch.cat([mu1, mu2], -1)
        logvar = torch.cat([logvar1, logvar2], -1)
        mus.append(mu.detach().cpu())
        logvars.append(logvar.detach().cpu())
        step += 1
        if step % 100 == 0:
            torch.cuda.empty_cache()
    mus = torch.cat(mus, 0)
    logvars = torch.cat(logvars, 0)

    # Identify which basis vectors of the variational embedding correspond to the
    # negative and positive styles, using a few dev examples from each end.
    if args.text_only:
        neg_sample = dev_data.data[:10]
        neg_inputs, _ = dev_data._to_tensor(neg_sample, batch_first=False, device=device)
    else:
        neg_sample = dev_feat[:10]
        neg_inputs = torch.tensor(neg_sample, dtype=torch.float,
                                  requires_grad=False, device=device)
    r, _ = model.vae.mlp_encoder(neg_inputs, True)
    p = model.vae.get_var_prob(r).mean(0, keepdim=True)
    neg_idx = torch.max(p, 1)[1].item()

    if args.text_only:
        pos_sample = dev_data.data[-10:]
        pos_inputs, _ = dev_data._to_tensor(pos_sample, batch_first=False, device=device)
    else:
        pos_sample = dev_feat[-10:]
        pos_inputs = torch.tensor(pos_sample, dtype=torch.float,
                                  requires_grad=False, device=device)
    r, _ = model.vae.mlp_encoder(pos_inputs, True)
    p = model.vae.get_var_prob(r).mean(0, keepdim=True)
    top2 = torch.topk(p, 2, 1)[1].squeeze()
    if top2[0].item() == neg_idx:
        print("Collision!!! Use second most as positive.")
        pos_idx = top2[1].item()
    else:
        pos_idx = top2[0].item()
    other_idx = -1
    for i in range(3):
        if i not in [pos_idx, neg_idx]:
            other_idx = i
            break
    print("Negative: %d" % neg_idx)
    print("Positive: %d" % pos_idx)

    sep_id = -1
    for idx, x in enumerate(test_data.labels):
        if x == 1:
            sep_id = idx
            break

    bsz = 64
    ori_logps = []
    tra_logps = []
    pos_z2 = model.vae.mlp_encoder.var_embedding[pos_idx:pos_idx + 1]
    neg_z2 = model.vae.mlp_encoder.var_embedding[neg_idx:neg_idx + 1]
    other_z2 = model.vae.mlp_encoder.var_embedding[other_idx:other_idx + 1]
    _, d0 = get_coordinates(pos_z2[0], neg_z2[0], other_z2[0])
    ori_obs = []
    tra_obs = []
    with open(os.path.join(args.load_path, 'generated_results.txt'), "w") as f:
        idx = 0
        step = 0
        n_samples = len(test_data.labels)
        while idx < n_samples:
            label = test_data.labels[idx]
            _idx = idx + bsz if label else min(idx + bsz, sep_id)
            _idx = min(_idx, n_samples)
            var_id = neg_idx if label else pos_idx
            text, _ = test_data._to_tensor(test_data.data[idx:_idx],
                                           batch_first=False, device=device)
            feat = torch.tensor(test_feat[idx:_idx], dtype=torch.float,
                                requires_grad=False, device=device)
            z1, _ = model.vae.lstm_encoder(text[:min(text.shape[0], 10)])
            ori_z2, _ = model.vae.mlp_encoder(feat)
            tra_z2 = model.vae.mlp_encoder.var_embedding[var_id:var_id + 1, :].expand(
                _idx - idx, -1)
            texts = model.vae.decoder.beam_search_decode(z1, tra_z2)
            for text in texts:
                f.write("%d\t%s\n" % (1 - label, " ".join(text[1:-1])))
            ori_z = torch.cat([z1, ori_z2], -1)
            tra_z = torch.cat([z1, tra_z2], -1)
            for i in range(_idx - idx):
                ori_logps.append(cal_log_density(mus, logvars, ori_z[i:i + 1].cpu()))
                tra_logps.append(cal_log_density(mus, logvars, tra_z[i:i + 1].cpu()))
            idx = _idx
            step += 1
            if step % 100 == 0:
                print(step, idx)

    with open(os.path.join(args.load_path, 'nll.txt'), "w") as f:
        for x, y in zip(ori_logps, tra_logps):
            f.write("%f\t%f\n" % (x, y))
def main(args): data_pth = "results/%s" % args.data_name train_pth = os.path.join(data_pth, ("train_identical_{}_{}.txt").format( str(args.confidence + 10), args.style)) dev_pth = os.path.join(data_pth, ("dev_identical_{}_{}.txt").format( str(args.confidence + 10), args.style)) test_pth = os.path.join(data_pth, ("test_identical_{}_{}.txt").format( str(args.confidence + 10), args.style)) train_data = MonoTextData(train_pth, True, vocab=100000) print("Training data = ", train_pth) vocab = train_data.vocab dev_data = MonoTextData(dev_pth, True, vocab=vocab) test_data = MonoTextData(test_pth, True, vocab=vocab) path = "checkpoint/{}-identical-{}-{}-classifier.pt".format( str(args.confidence), args.data_name, args.style) #path = "checkpoint/%s-classifier.pt" % args.data_name glove_embed = np.zeros((len(vocab), 300)) with open("data/glove.840B.300d.txt") as f: for line in f: word, vec = line.split(' ', 1) if word in vocab: wid = vocab[word] glove_embed[wid, :] = np.fromstring(vec, sep=' ', dtype=np.float32) _mu = glove_embed.mean() _std = glove_embed.std() glove_embed[:4, :] = np.random.randn(4, 300) * _std + _mu device = torch.device("cuda" if torch.cuda.is_available() else "cpu") train_batch, train_label = train_data.create_data_batch_labels( 64, device, batch_first=True) dev_batch, dev_label = dev_data.create_data_batch_labels(64, device, batch_first=True) test_batch, test_label = test_data.create_data_batch_labels( 64, device, batch_first=True) model = CNNClassifier(len(vocab), 300, [1, 2, 3, 4, 5], 500, 0.5).to(device) optimizer = optim.Adam(model.parameters(), lr=5e-4) nbatch = len(train_batch) best_acc = 0.0 step = 0 with torch.no_grad(): model.embedding.weight.fill_(0.) model.embedding.weight += torch.FloatTensor(glove_embed).to(device) for epoch in range(args.max_epochs): for idx in np.random.permutation(range(nbatch)): batch_data = train_batch[idx] batch_label = train_label[idx] batch_label = torch.tensor(batch_label, dtype=torch.float, requires_grad=False, device=device) optimizer.zero_grad() logits = model(batch_data) loss = F.binary_cross_entropy_with_logits(logits, batch_label) loss.backward() optimizer.step() step += 1 #print("step = ",step) if step % 1000 == 0: print('Loss: %2f' % loss.item()) model.eval() acc = evaluate(model, dev_batch, dev_label) model.train() print('Valid Acc: %.2f' % acc) if acc > best_acc: best_acc = acc print('saving to %s' % path) torch.save(model.state_dict(), path) model.load_state_dict(torch.load(path)) model.eval() acc = evaluate(model, test_batch, test_label) print('Test Acc: %.2f' % acc)
def main(args):
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_input_data.csv")
    train_feat_pth = os.path.join(data_pth, "train_%s.npy" % args.feat)
    train_data = MonoTextData(train_data_pth, True)
    train_feat = np.load(train_feat_pth)
    vocab = train_data.vocab
    print('Vocabulary size: %d' % len(vocab))

    dev_data_pth = os.path.join(data_pth, "dev_input_data.csv")
    dev_feat_pth = os.path.join(data_pth, "dev_%s.npy" % args.feat)
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    dev_feat = np.load(dev_feat_pth)

    test_data_pth = os.path.join(data_pth, "test_input_data.csv")
    test_feat_pth = os.path.join(data_pth, "test_%s.npy" % args.feat)
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)
    test_feat = np.load(test_feat_pth)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    save_path = '{}-{}-{}'.format(args.save, args.data_name, args.feat)
    save_path = os.path.join(save_path, time.strftime("%Y%m%d-%H%M%S"))
    scripts_to_save = [
        'run.py', 'models/decomposed_vae.py', 'models/vae.py',
        'models/base_network.py', 'config.py']
    logging = create_exp_dir(save_path, scripts_to_save=scripts_to_save,
                             debug=args.debug)

    if args.text_only:
        train, train_sentiments, train_tenses = train_data.create_data_batch_labels(
            args.bsz, device)
        dev, dev_sentiments, dev_tenses = dev_data.create_data_batch_labels(
            args.bsz, device)
        test, test_sentiments, test_tenses = test_data.create_data_batch_labels(
            args.bsz, device)
        feat = train
    else:
        # Note: the sentiment/tense label batches are only produced in the
        # text_only path above; this branch builds feature batches instead.
        train = train_data.create_data_batch_feats(args.bsz, train_feat, device)
        dev = dev_data.create_data_batch_feats(args.bsz, dev_feat, device)
        test = test_data.create_data_batch_feats(args.bsz, test_feat, device)
        feat = train_feat
    print("data done.")

    kwargs = {
        "train": train,
        "valid": dev,
        "test": test,
        "train_sentiments": train_sentiments,
        "train_tenses": train_tenses,
        "dev_sentiments": dev_sentiments,
        "dev_tenses": dev_tenses,
        "test_sentiments": test_sentiments,
        "test_tenses": test_tenses,
        "feat": feat,
        "bsz": args.bsz,
        "save_path": save_path,
        "logging": logging,
        "text_only": args.text_only,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    params["vae_params"]["text_only"] = args.text_only
    params["vae_params"]["mlp_ni"] = train_feat.shape[1]
    kwargs = dict(kwargs, **params)

    model = DecomposedVAE(**kwargs)
    try:
        valid_loss = model.fit()
        logging("val loss : {}".format(valid_loss))
    except KeyboardInterrupt:
        logging("Exiting from training early")

    model.load(save_path)
    test_loss = model.evaluate(model.test_data, model.test_feat)
    logging("test loss: {}".format(test_loss[0]))
    logging("test recon: {}".format(test_loss[1]))
    logging("test kl1: {}".format(test_loss[2]))
    logging("test kl2: {}".format(test_loss[3]))
    logging("test mi1: {}".format(test_loss[4]))
    logging("test mi2: {}".format(test_loss[5]))
def main(args):
    conf = config.CONFIG[args.data_name]
    data_pth = "data/%s" % args.data_name
    train_data_pth = os.path.join(data_pth, "train_data.txt")
    train_data = MonoTextData(train_data_pth, True)
    vocab = train_data.vocab

    dev_data_pth = os.path.join(data_pth, "dev_data.txt")
    dev_data = MonoTextData(dev_data_pth, True, vocab=vocab)
    test_data_pth = os.path.join(data_pth, "test_data.txt")
    test_data = MonoTextData(test_data_pth, True, vocab=vocab)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    kwargs = {
        "train": [1],
        "valid": None,
        "test": None,
        "bsz": 32,
        "save_path": args.load_path,
        "logging": None,
    }
    params = conf["params"]
    params["vae_params"]["vocab"] = vocab
    params["vae_params"]["device"] = device
    kwargs = dict(kwargs, **params)

    model = AgressiveVAE(**kwargs)
    model.load(args.load_path)
    model.vae.eval()

    # Encode a random subset of the training data and keep the first 16 latent dimensions.
    bsz = 64
    zs = []
    idx = 0
    step = 0
    n_samples = len(train_data.labels)
    n = 10000
    selected_index = np.random.permutation(np.arange(n_samples))[:n]
    while idx < n:
        label = train_data.labels[idx]
        _idx = idx + bsz
        _idx = min(_idx, n)
        inputs = []
        for i in range(idx, _idx):
            inputs.append(train_data.data[selected_index[i]])
        text, _ = train_data._to_tensor(inputs, batch_first=False, device=device)
        z, _ = model.vae.encode(text, 10)
        z = z.squeeze().cpu().detach().numpy()
        zs.append(z[:, :, :16].reshape(-1, 16))
        idx = _idx
        step += 1
        if step % 100 == 0:
            print(step, idx)
    zs = np.vstack(zs)

    # Build and visualize a Mapper graph of the collected latent codes.
    mapper = km.KeplerMapper(verbose=1)
    z_embed = mapper.fit_transform(zs, projection='sum')
    graph = mapper.map(z_embed, zs,
                       clusterer=sklearn.cluster.DBSCAN(eps=0.1, min_samples=3,
                                                        metric='cosine'),
                       cover=km.Cover(n_cubes=args.resolution, perc_overlap=0.4))
    mapper.visualize(graph, path_html='plot/tda_baseline.html',
                     title='tda baseline')