def main(args):
    """Decode latent-dimension sweeps around one encoded sentence.

    Loads the vocabulary from ``args.data_dir + '/ptb.vocab.json'`` and the
    ``SentenceVAE`` weights from ``args.load_checkpoint``, encodes a fixed
    sentence with ``output_mean=True`` and then, for every index below
    ``args.latent_size``, decodes a 5-step interpolation between the encoding
    shifted by -0.5 and by +0.5 at that index and prints the sentences.

    Args:
        args: namespace providing ``data_dir``, ``load_checkpoint`` and the
            hyper-parameters forwarded to ``SentenceVAE`` below.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    with open(args.data_dir + '/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(vocab_size=len(w2i),
                        sos_idx=w2i['<sos>'],
                        eos_idx=w2i['<eos>'],
                        pad_idx=w2i['<pad>'],
                        unk_idx=w2i['<unk>'],
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # Map storages to CPU so a checkpoint written on a GPU host also loads on
    # a CPU-only host; the model is moved to the GPU afterwards if there is one.
    model.load_state_dict(
        torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
    print("Model loaded from %s" % args.load_checkpoint)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    model.eval()

    input_sent = "looking for a job was one of the most anxious periods of my life and is for most people"
    tokens = input_sent.split()
    # Fall back to <unk> instead of raising KeyError on a word the loaded
    # vocabulary does not contain.
    unk_idx = w2i['<unk>']
    batch_input = torch.LongTensor([[w2i.get(tok, unk_idx) for tok in tokens]])
    batch_len = torch.LongTensor([len(tokens)])
    if use_cuda:
        # Inputs follow the model: only moved when the model was moved.
        batch_input, batch_len = batch_input.cuda(), batch_len.cuda()

    input_mean = model(batch_input, batch_len, output_mean=True)
    z_ = input_mean.cpu().detach().numpy()
    print(z_.shape)

    for i in range(args.latent_size):
        print(f"-------Dimension {i}------")
        z1, z2 = z_.copy(), z_.copy()
        # NOTE(review): indexes the FIRST axis of z_; if the printed shape is
        # (1, latent_size) this addresses rows, not latent dims - confirm.
        z1[i] -= 0.5
        z2[i] += 0.5
        z = to_var(
            torch.from_numpy(interpolate(start=z1, end=z2, steps=5)).float())
        samples, _ = model.inference(z=z)
        print('-------INTERPOLATION-------')
        print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
def main(args):
    """Print prior samples and one latent interpolation from a poems model.

    Reads ``args.data_dir + '/poems.vocab.json'``, builds a ``SentenceVAE``
    with ``condition_size=0``, restores ``args.load_checkpoint`` (mapped to
    CPU), then prints ``args.num_samples`` sampled sentences followed by the
    decoding of an 8-step interpolation between two random latent vectors.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    vocab_path = args.data_dir + '/poems.vocab.json'
    with open(vocab_path, 'r') as vocab_file:
        vocabulary = json.load(vocab_file)
    w2i = vocabulary['w2i']
    i2w = vocabulary['i2w']

    model_kwargs = dict(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,
        condition_size=0,
    )
    model = SentenceVAE(**model_kwargs)

    checkpoint = args.load_checkpoint
    if not os.path.exists(checkpoint):
        raise FileNotFoundError(checkpoint)
    state = torch.load(checkpoint, map_location=torch.device('cpu'))
    model.load_state_dict(state)
    print("Model loaded from %s" % (checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()
    model.eval()

    def show(header, index_batch):
        # Banner first, then one decoded sentence per line.
        print(header)
        sentences = idx2word(index_batch, i2w=i2w, pad_idx=w2i['<pad>'])
        print(*sentences, sep='\n')

    # Unconditional samples from the prior.
    prior_samples, _ = model.inference(n=args.num_samples)
    show('----------SAMPLES----------', prior_samples)

    # Interpolate between two random latent codes and decode every step.
    start = torch.randn([args.latent_size]).numpy()
    end = torch.randn([args.latent_size]).numpy()
    path = interpolate(start=start, end=end, steps=8)
    latent_path = to_var(torch.from_numpy(path).float())
    path_samples, _ = model.inference(z=latent_path)
    show('-------INTERPOLATION-------', path_samples)
def main(args):
    """Decode the same latent interpolation with two checkpoints.

    Restores ``args.load_checkpoint`` into a ``SentenceVAE``, decodes an
    8-step interpolation between two random latent vectors and prints it,
    then loads a second checkpoint into the same model and decodes the same
    interpolation again for comparison.

    Args:
        args: namespace with ``data_dir``, ``load_checkpoint`` and the model
            hyper-parameters. An optional ``compare_checkpoint`` attribute
            overrides the second checkpoint path; when absent the previously
            hard-coded path is used.

    Raises:
        FileNotFoundError: if either checkpoint file does not exist.
    """
    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    def _load_weights(path):
        # Fail with the path in the message, and map to CPU so GPU-saved
        # checkpoints also load on CPU-only hosts (load_state_dict copies the
        # values onto whatever device the model currently lives on).
        if not os.path.exists(path):
            raise FileNotFoundError(path)
        model.load_state_dict(
            torch.load(path, map_location=torch.device('cpu')))

    _load_weights(args.load_checkpoint)
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    z1 = torch.randn([args.latent_size]).numpy()
    z2 = torch.randn([args.latent_size]).numpy()

    def _decode_interpolation():
        # Rebuild z from the same end points each time so both checkpoints
        # decode identical latent codes.
        z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
        samples, _ = model.inference(z=z)
        print('-------INTERPOLATION-------')
        print(idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']))

    _decode_interpolation()

    # Second checkpoint: configurable, defaulting to the original fixed path.
    second_checkpoint = getattr(
        args, 'compare_checkpoint', 'bin/2019-May-16-04:24:16/E10.pytorch')
    _load_weights(second_checkpoint)
    _decode_interpolation()
def load_vae_model_from_args(args):
    """Build a ``SentenceVAE`` from ``args`` and restore its checkpoint.

    Args:
        args: namespace with ``data_dir`` (holding ``ptb.vocab.json``),
            ``load_checkpoint`` and the model hyper-parameters used below.

    Returns:
        dict with keys ``'model'`` (in eval mode, on GPU when one is
        available), ``'tokenizer'`` (a ``DefaultTokenizer``), ``'w2i'`` and
        ``'i2w'``.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )

    tokenizer = DefaultTokenizer()

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts.
    model.load_state_dict(
        torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
    print("Model loaded from %s" % args.load_checkpoint)

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    return {
        'model': model,
        'tokenizer': tokenizer,
        'w2i': w2i,
        'i2w': i2w,
    }
def main(args):
    """Sample, interpolate and round-trip one validation batch.

    Restores a ``SentenceVAE`` from ``args.load_checkpoint`` and prints, in
    order: ``args.num_samples`` prior samples, an 8-step latent
    interpolation, and the encode/decode round trip of the first batch
    yielded by the ``Amazon`` validation split.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    data_name = args.data_name
    # NOTE: plain concatenation - args.data_dir must carry its own trailing
    # separator for this path to resolve.
    with open(args.data_dir+data_name+'.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts.
    model.load_state_dict(
        torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    samples, z = model.inference(n=args.num_samples)
    print('----------SAMPLES----------')
    print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    z1 = torch.randn([args.latent_size]).numpy()
    z2 = torch.randn([args.latent_size]).numpy()
    z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    samples, _ = model.inference(z=z)
    print('-------INTERPOLATION-------')
    print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    print('-------Encode ... Decode-------')

    datasets = Amazon(
        data_dir=args.data_dir,
        split="valid",
        create_data=False,
        batch_size=10,
        max_sequence_length=args.max_sequence_length,
        min_occ=3
    )

    for input_batch_tensor, target_batch_tensor, length_batch_tensor in datasets:
        if torch.is_tensor(input_batch_tensor):
            input_batch_tensor = to_var(input_batch_tensor)

        if torch.is_tensor(target_batch_tensor):
            target_batch_tensor = to_var(target_batch_tensor)

        if torch.is_tensor(length_batch_tensor):
            length_batch_tensor = to_var(length_batch_tensor)

        print("*"*10)
        print("->"*10, *idx2word(input_batch_tensor, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
        logp, mean, logv, z = model(input_batch_tensor, length_batch_tensor)

        # Decode from the sampled latent code of the encoded batch.
        samples, z = model.inference(z=z)
        print("<-"*10, *idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

        # Only the first batch is shown (the former counter always broke out
        # on iteration 0, so its increment was unreachable).
        break
def main(args):
    """Sample / interpolate a SentenceVAE, optionally steering z with an Actor.

    Depending on the flags on ``args``:

    * ``args.sample``: draws ``args.num_samples`` sentences from the prior
      and hands latent codes, sentences and tags to ``pickle_it``.
    * ``args.interpolate``: decodes an interpolation between two random
      latent vectors and stores the same three artefacts.
    * ``args.constraint_mode``: additionally, for every entry of ``LABELS``,
      passes the latent batch through the loaded ``Actor`` and decodes and
      stores the resulting sentences under ``LABEL_NAMES``.

    Raises:
        FileNotFoundError: if ``args.load_vae`` (or, in constraint mode,
            ``args.load_actor``) does not exist.
    """
    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    # required to map between integer-value sentences and real sentences
    w2i, i2w = vocab['w2i'], vocab['i2w']

    # make sure our models for the VAE and Actor exist
    if not os.path.exists(args.load_vae):
        raise FileNotFoundError(args.load_vae)

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )

    model.load_state_dict(
        torch.load(args.load_vae, map_location=lambda storage, loc: storage))
    model.eval()
    print("vae model loaded from %s"%(args.load_vae))

    # to run in constraint mode, we need the trained generator
    if args.constraint_mode:
        if not os.path.exists(args.load_actor):
            raise FileNotFoundError(args.load_actor)

        actor = Actor(
            dim_z=args.latent_size, dim_model=2048, num_labels=args.n_tags)
        actor.load_state_dict(
            torch.load(args.load_actor, map_location=lambda storage, loc:storage))
        actor.eval()
        print("actor model loaded from %s"%(args.load_actor))

    if torch.cuda.is_available():
        model = model.cuda()
        if args.constraint_mode:
            actor = actor.cuda()  # TODO: to(self.devices)

    def _decode_conditioned(z, kind):
        # For each label vector: steer z with the actor, decode, print and
        # collect; then store tags / sentences / latents under the given
        # file-name infix ('sample' or 'interp').
        all_tags_prime = []
        all_sents_prime = {}
        all_z_prime = {}

        for i, condition in enumerate(LABELS):
            # binary vector denoting each of the PHRASE_TAGS; placed on the
            # same device as z so this also runs on CPU-only hosts
            labels = torch.Tensor(condition).repeat(args.num_samples, 1).to(z.device)

            # take z and manipulate using the actor to generate z_prime
            z_prime = actor.forward(z, labels)

            sents_prime, z_prime = model.inference(
                z=z_prime, n=args.num_samples)
            sents_prime, tags_prime = get_sents_and_tags(sents_prime, i2w, w2i)
            print('conditoned on: {}'.format(condition))
            print(sents_prime, sep='\n')

            all_tags_prime.append(tags_prime)
            all_sents_prime[LABEL_NAMES[i]] = sents_prime
            all_z_prime[LABEL_NAMES[i]] = z_prime.data.cpu().numpy()

        pickle_it(all_tags_prime,
                  'samples/tags_{}_prime_n{}.pkl'.format(kind, args.num_samples))
        pickle_it(all_sents_prime,
                  'samples/sents_{}_prime_n{}.pkl'.format(kind, args.num_samples))
        pickle_it(all_z_prime,
                  'samples/z_{}_prime_n{}.pkl'.format(kind, args.num_samples))

    if args.sample:
        print('*** SAMPLE Z: ***')
        # get samples from the prior
        sample_sents, z = model.inference(n=args.num_samples)
        sample_sents, sample_tags = get_sents_and_tags(sample_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(), 'samples/z_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_sents, 'samples/sents_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_tags, 'samples/tags_sample_n{}.pkl'.format(args.num_samples))
        print(sample_sents, sep='\n')

        if args.constraint_mode:
            print('*** SAMPLE Z_PRIME: ***')
            # get samples from the prior, conditioned via the actor
            _decode_conditioned(z, 'sample')

    if args.interpolate:
        # get random samples from the latent space
        z1 = torch.randn([args.latent_size]).numpy()
        z2 = torch.randn([args.latent_size]).numpy()
        # NOTE(review): steps is num_samples-2 while labels are repeated
        # num_samples times - presumably interpolate() adds both end points;
        # confirm against its definition.
        z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=args.num_samples-2)).float())

        print('*** INTERP Z: ***')
        interp_sents, _ = model.inference(z=z)
        interp_sents, interp_tags = get_sents_and_tags(interp_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(), 'samples/z_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_sents, 'samples/sents_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_tags, 'samples/tags_interp_n{}.pkl'.format(args.num_samples))
        print(interp_sents, sep='\n')

        if args.constraint_mode:
            print('*** INTERP Z_PRIME: ***')
            # z prime conditioned on each binary label vector
            _decode_conditioned(z, 'interp')

    # NOTE(review): interactive debugging hook left in place on purpose;
    # remove once the script is run unattended.
    import IPython; IPython.embed()
def main(arguments):
    """Evaluate a trained sentence (V)AE encoder with SentEval.

    Parses ``arguments``, configures logging, builds a ``SentenceVAE`` or
    ``SentenceAE`` from the vocabulary at ``--vocab_path``, restores
    ``--ckpt_path`` and runs ``senteval`` on the tasks named by ``--tasks``.
    Results are written via ``write_results`` when ``--out_dir`` is set and
    are printed, or logged when ``--log_file`` is given.

    Args:
        arguments: list of command-line tokens (e.g. ``sys.argv[1:]``).
    """
    def _str2bool(value):
        # argparse's type=bool treats every non-empty string (including
        # "0" and "False") as True; parse the text explicitly instead.
        lowered = value.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y'):
            return True
        if lowered in ('0', 'false', 'f', 'no', 'n', ''):
            return False
        raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)

    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--seed", help="Random seed", type=int, default=19)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--out_dir", help="Dir to write preds to", type=str, default='')
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--load_data", help="0 to read data from scratch", type=int, default=1)

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list", type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)

    # Model options
    parser.add_argument("--ckpt_path", help="Path to ckpt to load", type=str,
                        default=PATH_PREFIX + 'ckpts/svae/glue_svae/best.mdl')
    parser.add_argument("--vocab_path", help="Path to vocab to use", type=str,
                        default=PATH_PREFIX + 'processed_data/svae/glue_v2/vocab.json')
    # choices makes an unknown model a usage error instead of a NameError on
    # the unbound `model` further down.
    parser.add_argument("--model", help="Model type to load", type=str,
                        choices=['vae', 'ae'], default='vae')
    parser.add_argument("--embedding_size", help="Word emb dim", type=int, default=300)
    parser.add_argument("--word_dropout", help="Word dropout rate", type=float, default=0.5)
    parser.add_argument("--hidden_size", help="RNN size", type=int, default=512)
    parser.add_argument("--latent_size", help="Latent vector dim", type=int, default=16)
    parser.add_argument("--num_layers", help="Number of encoder layers", type=int, default=1)
    parser.add_argument("--bidirectional", help="1 for bidirectional", type=_str2bool, default=False)
    parser.add_argument("--rnn_type", help="Type of rnn", type=str, choices=['rnn', 'gru'],
                        default='gru')
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=64)

    # Classifier options
    parser.add_argument("--cls_batch_size", help="Batch size to use", type=int, default=64)

    args = parser.parse_args(arguments)
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    if args.log_file:
        fileHandler = logging.FileHandler(args.log_file)
        logging.getLogger().addHandler(fileHandler)
    logging.info(args)

    # define senteval params
    params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': args.use_pytorch, 'kfold': 10,
                       'max_seq_len': args.max_seq_len, 'batch_size': args.batch_size,
                       'load_data': args.load_data, 'seed': args.seed}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': args.cls_batch_size,
                                     'tenacity': 5, 'epoch_size': 4, 'cudaEfficient': True}

    # Load the vocabulary (context manager so the handle is closed)
    with open(args.vocab_path, 'r') as vocab_file:
        vocab = json.load(vocab_file)

    # Extra attributes set on args before it is handed to the model
    # constructor; denoising is switched off for evaluation.
    args.denoise = False
    args.prob_swap, args.prob_drop = 0.0, 0.0

    if args.model == 'vae':
        model = SentenceVAE(args, vocab['w2i'],
                            embedding_size=args.embedding_size,
                            rnn_type=args.rnn_type,
                            hidden_size=args.hidden_size,
                            word_dropout=args.word_dropout,
                            latent_size=args.latent_size,
                            num_layers=args.num_layers,
                            bidirectional=args.bidirectional)
    elif args.model == 'ae':
        model = SentenceAE(args, vocab['w2i'],
                           embedding_size=args.embedding_size,
                           rnn_type=args.rnn_type,
                           hidden_size=args.hidden_size,
                           word_dropout=args.word_dropout,
                           latent_size=args.latent_size,
                           num_layers=args.num_layers,
                           bidirectional=args.bidirectional)

    # Load on CPU first so GPU-saved checkpoints work everywhere, then move
    # to the GPU only when one is present.
    model.load_state_dict(
        torch.load(args.ckpt_path, map_location=torch.device('cpu')))
    if torch.cuda.is_available():
        model = model.cuda()
    model.eval()
    params_senteval['model'] = model

    # Do SentEval stuff
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    tasks = get_tasks(args.tasks)
    results = se.eval(tasks)
    if args.out_dir:
        write_results(results, args.out_dir)
    if not args.log_file:
        print(results)
    else:
        logging.info(results)
def main(args):
    """Encode one hard-coded review and decode it back repeatedly.

    Restores a ``SentenceVAE`` from ``args.load_checkpoint``, converts a
    fixed raw sentence to indices with ``f_raw2vec``, runs it through the
    model, decodes once from the (unsqueezed) ``mean`` and then ten more
    times, each time feeding back the ``z`` returned by the previous
    ``model.inference`` call. Intermediate tensors are printed for debugging.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    data_name = args.data_name
    # NOTE: plain concatenation - args.data_dir must carry its own trailing
    # separator for this path to resolve.
    with open(args.data_dir+data_name+'.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts.
    model.load_state_dict(
        torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    tokenizer = TweetTokenizer(preserve_case=False)

    raw_text = "DON'T CARE FOR IT. GAVE IT AS A GIFT AND THEY WERE OKAY WITH IT. JUST NOT WHAT I EXPECTED."
    input_text = f_raw2vec(tokenizer, raw_text, w2i, i2w)
    length_text = [len(input_text)]
    print("length_text", length_text)

    input_tensor = torch.LongTensor(input_text)
    print('input_tensor', input_tensor)
    # Add the batch axis; the former torch.is_tensor() guards were always
    # true for these freshly built tensors and have been dropped.
    input_tensor = to_var(input_tensor.unsqueeze(0))

    length_tensor = torch.LongTensor(length_text)
    print("length_tensor", length_tensor)
    length_tensor = to_var(length_tensor)

    print("*"*10)
    print("->"*10, *idx2word(input_tensor, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
    logp, mean, logv, z = model(input_tensor, length_tensor)

    mean = mean.unsqueeze(0)
    print("mean", mean)
    print("z", z)

    # First decode from the posterior mean ...
    samples, z = model.inference(z=mean)
    print("<-"*10, *idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    # ... then keep re-decoding from whatever z inference hands back.
    for _ in range(10):
        samples, z = model.inference(z=z)
        print("<-"*10, *idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
def main(args):
    """Train a ``SentenceVAE`` on PTB with KL annealing and logging.

    For every epoch and every split ('train', 'valid', optionally 'test')
    the model is run over the data; on 'train' the parameters are updated
    and the annealing ``step`` advances, on the other splits the KL weight
    is fixed to 1. Per-batch metrics go to stdout and, if enabled, to
    TensorBoard and a log file. After each 'valid' pass the target
    sentences and latent codes are dumped to ``dumps/<ts>/valid_E<epoch>.json``;
    after each 'train' pass a checkpoint and the accumulated non-train
    latent vectors are saved.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` is given (not the
            string ``'None'``) and does not exist.
        ValueError: if ``args.anneal_function`` is neither ``'logistic'``
            nor ``'linear'``.
    """
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(
            data_dir=args.data_dir,
            split=split,
            create_data=args.create_data,
            max_sequence_length=args.max_sequence_length,
            min_occ=args.min_occ
        )

    model = SentenceVAE(
        vocab_size=datasets['train'].vocab_size,
        sos_idx=datasets['train'].sos_idx,
        eos_idx=datasets['train'].eos_idx,
        pad_idx=datasets['train'].pad_idx,
        unk_idx=datasets['train'].unk_idx,
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    # `tensor` is the empty-tensor factory used for the accumulators below;
    # it must match the device the model outputs live on.
    if torch.cuda.is_available():
        model = model.cuda()
        tensor = torch.cuda.FloatTensor
    else:
        tensor = torch.Tensor

    if args.load_checkpoint != 'None':
        if not os.path.exists(args.load_checkpoint):
            raise FileNotFoundError(args.load_checkpoint)
        # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts.
        model.load_state_dict(
            torch.load(args.load_checkpoint, map_location=torch.device('cpu')))

    # Latent codes of every non-train batch, accumulated across ALL epochs.
    latent_vector = defaultdict(tensor)

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args,ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    if args.file_logging:
        log_file = open(os.path.join(args.logdir, expierment_name(args,ts)+"_logfile.txt"), "w")

    try:
        save_model_path = os.path.join(args.save_model_path, ts)
        os.makedirs(save_model_path)

        def kl_anneal_function(anneal_function, step, k, x0):
            # Weight of the KL term as a function of the training step.
            if anneal_function == 'logistic':
                return float(1/(1+np.exp(-k*(step-x0))))
            elif anneal_function == 'linear':
                return min(1, step/x0)
            # Previously fell through and returned None, which only failed
            # later as a TypeError when multiplied with the KL loss.
            raise ValueError("unknown anneal_function: %r" % (anneal_function,))

        NLL = torch.nn.NLLLoss(size_average=False, ignore_index=datasets['train'].pad_idx)

        def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):
            # cut-off unnecessary padding from target, and flatten
            target = target[:, :torch.max(length).item()].contiguous().view(-1)
            logp = logp.view(-1, logp.size(2))

            # Negative Log Likelihood
            NLL_loss = NLL(logp, target)

            # KL Divergence
            KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
            KL_weight = kl_anneal_function(anneal_function, step, k, x0)

            return NLL_loss, KL_loss, KL_weight

        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

        step = 0
        for epoch in range(args.epochs):

            for split in splits:

                data_loader = DataLoader(
                    dataset=datasets[split],
                    batch_size=args.batch_size,
                    shuffle=split=='train',
                    num_workers=cpu_count(),
                    pin_memory=torch.cuda.is_available()
                )

                tracker = defaultdict(tensor)

                # Enable/Disable Dropout
                if split == 'train':
                    fix_kl_weight = False
                    model.train()
                else:
                    fix_kl_weight = True
                    model.eval()

                for iteration, batch in enumerate(data_loader):

                    batch_size = batch['input'].size(0)

                    for k, v in batch.items():
                        if torch.is_tensor(v):
                            batch[k] = to_var(v)

                    # Forward pass
                    logp, mean, logv, z = model(batch['input'], batch['length'])

                    if split != 'train':
                        latent_vector['latent'] = torch.cat((latent_vector['latent'], z.data))

                    # loss calculation
                    NLL_loss, KL_loss, KL_weight = loss_fn(
                        logp, batch['target'], batch['length'], mean, logv,
                        args.anneal_function, step, args.k, args.x0)

                    # Evaluation always uses the full KL term.
                    if fix_kl_weight:
                        KL_weight = 1

                    loss = (NLL_loss + KL_weight * KL_loss)/batch_size

                    # backward + optimization
                    if split == 'train':
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                        step += 1

                    # bookkeepeing: view(1) keeps the value on the loss's own
                    # device, so this no longer requires CUDA.
                    tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.data.view(1)))

                    global_step = epoch*len(data_loader) + iteration
                    if args.tensorboard_logging:
                        writer.add_scalar("%s/ELBO"%split.upper(), loss.item(), global_step)
                        writer.add_scalar("%s/NLL Loss"%split.upper(), NLL_loss.item()/batch_size, global_step)
                        writer.add_scalar("%s/KL Loss"%split.upper(), KL_loss.item()/batch_size, global_step)
                        writer.add_scalar("%s/KL Weight"%split.upper(), KL_weight, global_step)

                    if iteration % args.print_every == 0 or iteration+1 == len(data_loader):
                        batch_line = ("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                                      %(split.upper(), iteration, len(data_loader)-1, loss.item(),
                                        NLL_loss.item()/batch_size, KL_loss.item()/batch_size, KL_weight))
                        print(batch_line)
                        if args.file_logging:
                            log_file.write(batch_line + " \n")

                    if split == 'valid':
                        if 'target_sents' not in tracker:
                            tracker['target_sents'] = list()
                        tracker['target_sents'] += idx2word(
                            batch['target'].data, i2w=datasets['train'].get_i2w(),
                            pad_idx=datasets['train'].pad_idx)
                        tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

                mean_elbo = torch.mean(tracker['ELBO'])
                epoch_line = "%s Epoch %02d/%i, Mean ELBO %9.4f"%(split.upper(), epoch, args.epochs, mean_elbo)
                print(epoch_line)
                if args.file_logging:
                    log_file.write(epoch_line + " \n")

                if args.tensorboard_logging:
                    writer.add_scalar("%s-Epoch/ELBO"%split.upper(), mean_elbo, epoch)

                # save a dump of all sentences and the encoded latent space
                if split == 'valid':
                    dump = {'target_sents':tracker['target_sents'], 'z':tracker['z'].tolist()}
                    dump_dir = os.path.join('dumps', ts)
                    os.makedirs(dump_dir, exist_ok=True)
                    with open(os.path.join(dump_dir, 'valid_E%i.json'%epoch), 'w') as dump_file:
                        json.dump(dump,dump_file)

                # save checkpoint
                if split == 'train':
                    checkpoint_path = os.path.join(save_model_path, "E%i.pytorch"%(epoch))
                    torch.save(model.state_dict(), checkpoint_path)
                    print("Model saved at %s"%checkpoint_path)
                    # NOTE(review): saved right after the train pass, so the
                    # file holds latents from earlier epochs' eval passes only
                    # - confirm this placement is intended.
                    torch.save(latent_vector['latent'], '.latent_vector_{}.pt'.format(epoch))
    finally:
        # Close the log even when training aborts with an exception.
        if args.file_logging:
            log_file.close()
def generate(date, epoch, sentiment, n_samples): date = date cuda2 = torch.device('cuda:0') epoch = epoch #date = "2020-Feb-26-17:47:47" #exp_descr = pd.read_csv("EXP_DESCR/" + date + ".csv") #print("Pretained: ", exp_descr['pretrained'][0]) #print("Bidirectional: ", exp_descr['Bidirectional'][0]) #epoch = str(10) #data_dir = 'data' # params = pd.read_csv("Parameters/params.csv") params = params.set_index('time') exp_descr = params.loc[date] # 2019-Dec-02-09:35:25, 60,300,256,0.3,0.5,16,False,0.001,10,False embedding_size = exp_descr["embedding_size"] hidden_size = exp_descr["hidden_size"] rnn_type = exp_descr['rnn_type'] word_dropout = exp_descr["word_dropout"] embedding_dropout = exp_descr["embedding_dropout"] latent_size = exp_descr["latent_size"] num_layers = 1 batch_size = exp_descr["batch_size"] bidirectional = bool(exp_descr["bidirectional"]) max_sequence_length = exp_descr["max_sequence_length"] back = exp_descr["back"] attribute_size = exp_descr["attr_size"] wd_type = exp_descr["word_drop_type"] num_samples = 2 save_model_path = 'bin' ptb = False if ptb == True: vocab_dir = '/ptb.vocab.json' else: vocab_dir = '/yelp_vocab.json' with open("bin/" + date + "/" + vocab_dir, 'r') as file: vocab = json.load(file) w2i, i2w = vocab['w2i'], vocab['i2w'] model = SentenceVAE(vocab_size=len(w2i), sos_idx=w2i['<sos>'], eos_idx=w2i['<eos>'], pad_idx=w2i['<pad>'], unk_idx=w2i['<unk>'], max_sequence_length=max_sequence_length, embedding_size=embedding_size, rnn_type=rnn_type, hidden_size=hidden_size, word_dropout=0, embedding_dropout=0, latent_size=latent_size, num_layers=num_layers, cuda=cuda2, bidirectional=bidirectional, attribute_size=attribute_size, word_dropout_type='static', back=back) print(model) # Results # 2019-Nov-28-13:23:06/E4-5".pytorch" load_checkpoint = "bin/" + date + "/" + "E" + str(epoch) + ".pytorch" # load_checkpoint = "bin/2019-Nov-28-12:03:44 /E0.pytorch" if not os.path.exists(load_checkpoint): raise FileNotFoundError(load_checkpoint) if 
torch.cuda.is_available(): model = model.cuda() device = "cuda" else: device = "cpu" model.load_state_dict( torch.load(load_checkpoint, map_location=torch.device(device))) def attr_generation(n): labels = np.random.randint(2, size=n) enc = OneHotEncoder(handle_unknown='ignore') labels = np.reshape(labels, (len(labels), 1)) enc.fit(labels) one_hot = enc.transform(labels).toarray() one_hot = one_hot.astype(np.float32) one_hot = torch.from_numpy(one_hot) return one_hot model.eval() labels = attr_generation(n=num_samples) from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from sklearn.metrics import accuracy_score analyser = SentimentIntensityAnalyzer() def sentiment_analyzer_scores(sentence): score = analyser.polarity_scores(sentence) if score['compound'] > 0.05: return 1, 'Positive' else: return 0, 'Negative' print('----------SAMPLES----------') labels = [] generated = [] for i in range(n_samples): samples, z, l = model.inference(sentiment) s = idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']) #print(sentiment_analyzer_scores(s[0])) if sentiment_analyzer_scores(s[0])[1] == sentiment: generated.append(s[0]) labels.append(sentiment_analyzer_scores(s[0])[0]) #print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n') print(sum(labels)) translation = idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']) return generated '''
args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() with open(args.data_dir + '/ptb.vocab.json', 'r') as file: vocab = json.load(file) w2i, i2w = vocab['w2i'], vocab['i2w'] embedding = KeyedVectors.load('model/pretrained_embedding') weights = torch.FloatTensor(embedding.syn0) model = SentenceVAE(vocab_size=weights.size(0), sos_idx=w2i['<sos>'], eos_idx=w2i['<eos>'], pad_idx=w2i['<pad>']) model.load_state_dict(torch.load(args.load_checkpoint)) print("Model loaded from %s" % (args.load_checkpoint)) if torch.cuda.is_available(): model = model.cuda() model.eval() print('----------SAMPLES----------') for i in range(5): sample, z = model.inference() sample = sample.cpu().numpy() print(sample) print(idx2word(sample, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n') datasets = OrderedDict()
def main(args):
    """Print, per latent dimension, the training sentences nearest to a probe.

    Rebuilds the ``SentenceVAE`` from ``model_params.json`` under
    ``args.save_model_path``, restores ``args.load_checkpoint``, encodes a
    fixed probe sentence with ``output_mean=True`` and every training batch
    the same way, and for each latent dimension prints the 20 training
    sentences whose value in that dimension is closest (absolute
    difference) to the probe's.
    """
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(data_dir=args.data_dir,
                              split=split,
                              create_data=args.create_data,
                              max_sequence_length=args.max_sequence_length,
                              min_occ=args.min_occ)

    with open(args.data_dir + '/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    with open(os.path.join(args.save_model_path, 'model_params.json'), 'r') as f:
        params = json.load(f)
    model = SentenceVAE(**params)

    # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts.
    model.load_state_dict(
        torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
    print("Model loaded from %s" % args.load_checkpoint)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    # Inference only: switch off dropout so encodings are deterministic.
    model.eval()

    print(model)

    with torch.no_grad():
        input_sent = "the n stock specialist firms on the big board floor the buyers and sellers of last resort who were criticized after the n crash once again could n't handle the selling pressure"
        tokens = input_sent.split()
        # Fall back to <unk> rather than raising KeyError on unknown words.
        unk_idx = w2i['<unk>']
        batch_input = torch.LongTensor([[w2i.get(tok, unk_idx) for tok in tokens]])
        batch_len = torch.LongTensor([len(tokens)])
        if use_cuda:
            # Inputs follow the model: only moved when the model was moved.
            batch_input, batch_len = batch_input.cuda(), batch_len.cuda()
        input_mean = model(batch_input, batch_len, output_mean=True)

        # shuffle=False keeps row order stable so top-k indices map back to
        # the concatenated sentence tensor.
        data_loader = DataLoader(dataset=datasets["train"],
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=cpu_count(),
                                 pin_memory=torch.cuda.is_available())

        print('---------CALCULATING NEAREST SENTENCES--------')

        sim = []
        all_sentences = []
        for iteration, batch in enumerate(data_loader):
            for k, v in batch.items():
                if torch.is_tensor(v):
                    batch[k] = to_var(v)

            all_sentences.append(batch['input'])

            # Forward pass
            mean = model(batch['input'], batch['length'], output_mean=True)
            # Per-dimension distance to the probe encoding.
            batch_sim = torch.abs(mean - input_mean)
            sim.append(batch_sim)

        sim = torch.cat(sim, dim=0)
        # Negate so topk returns the SMALLEST distances along the data axis.
        _, most_similar_per_dim = torch.topk(-sim, k=20, dim=0)
        most_similar_per_dim = most_similar_per_dim.transpose(0, 1)
        all_sentences = torch.cat(all_sentences, dim=0)

        for dim, i in enumerate(most_similar_per_dim):
            sentences = torch.index_select(all_sentences, dim=0, index=i)
            print(f"{dim=}")
            print(*idx2word(sentences, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
def main(args):
    """Encode and decode one batch of two PTB validation sentences.

    Restores a ``SentenceVAE`` from ``args.load_checkpoint``, draws the
    first batch from a shuffled loader over the PTB validation split, prints
    the input sentences, then prints the sentences decoded from the latent
    codes the model produced for them.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts.
    model.load_state_dict(
        torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    print('-------Encode ... Decode-------')

    datasets = PTB(
        data_dir=args.data_dir,
        split="valid",
        create_data=False,
        max_sequence_length=args.max_sequence_length,
        min_occ=1
    )

    # The loader used to receive shuffle='valid', a truthy string, i.e. it
    # shuffled; the boolean is spelled out here with the same effect.
    # NOTE(review): if a fixed first batch was intended, this should be False.
    data_loader = DataLoader(dataset=datasets, batch_size=2, shuffle=True,
                             num_workers=cpu_count(), pin_memory=torch.cuda.is_available())

    for batch in data_loader:

        for k, v in batch.items():
            if torch.is_tensor(v):
                batch[k] = to_var(v)

        print("*"*10)
        print(*idx2word(batch['input'], i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
        logp, mean, logv, z = model(batch['input'], batch['length'])

        print("+"*10)
        samples, z = model.inference(z=z)
        print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

        # Only the first batch is shown.
        break