Example no. 1
def main(args):
    with open(args.data_dir + '/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(vocab_size=len(w2i),
                        sos_idx=w2i['<sos>'],
                        eos_idx=w2i['<eos>'],
                        pad_idx=w2i['<pad>'],
                        unk_idx=w2i['<unk>'],
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    model.load_state_dict(torch.load(args.load_checkpoint))
    print("Model loaded from %s" % args.load_checkpoint)

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    # samples, z = model.inference(n=args.num_samples)
    # print('----------SAMPLES----------')
    # print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    # z_ = torch.randn([args.latent_size]).numpy()
    # input_sent = "the n stock specialist firms on the big board floor the buyers and sellers of last resort who were criticized after the n crash once again could n't handle the selling pressure"
    input_sent = "looking for a job was one of the most anxious periods of my life and is for most people"
    # fall back to <unk> for out-of-vocabulary tokens, and follow the model's
    # device instead of assuming a GPU is present
    device = next(model.parameters()).device
    batch_input = torch.LongTensor(
        [[w2i.get(i, w2i['<unk>']) for i in input_sent.split()]]).to(device)
    batch_len = torch.LongTensor([len(input_sent.split())]).to(device)
    input_mean = model(batch_input, batch_len, output_mean=True)
    z_ = input_mean.cpu().detach().numpy()
    print(z_.shape)
    # z2 = torch.randn([args.latent_size]).numpy()
    for i in range(args.latent_size):
        print(f"-------Dimension {i}------")
        z1, z2 = z_.copy(), z_.copy()
        z1[i] -= 0.5
        z2[i] += 0.5
        z = to_var(
            torch.from_numpy(interpolate(start=z1, end=z2, steps=5)).float())
        samples, _ = model.inference(z=z)
        print('-------INTERPOLATION-------')
        print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
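The `interpolate` helper used above (and in the later examples) is not shown in these snippets. A minimal sketch, assuming it draws a straight line in latent space and returns `steps + 2` points including both endpoints:

import numpy as np

def interpolate(start, end, steps):
    # one linspace per latent dimension, endpoints included
    interpolation = np.zeros((start.shape[0], steps + 2))
    for dim, (s, e) in enumerate(zip(start, end)):
        interpolation[dim] = np.linspace(s, e, steps + 2)
    return interpolation.T  # shape: (steps + 2, latent_size)

This is consistent with Example no. 6 below, which passes steps=args.num_samples-2 to obtain num_samples points in total.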
Example no. 2
def main(args):

    with open(args.data_dir + '/poems.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(vocab_size=len(w2i),
                        sos_idx=w2i['<sos>'],
                        eos_idx=w2i['<eos>'],
                        pad_idx=w2i['<pad>'],
                        unk_idx=w2i['<unk>'],
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional,
                        condition_size=0)

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    model.load_state_dict(
        torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
    print("Model loaded from %s" % (args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()
    samples, z = model.inference(n=args.num_samples)
    print('----------SAMPLES----------')
    print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
    # while True:
    #     samples, z = model.inference(n=1, condition=torch.Tensor([[1, 0, 0, 0, 0, 0, 0]]).cuda())
    #     poem = idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>'])[0]
    #     if 'love' in poem:
    #         breakpoint()

    z1 = torch.randn([args.latent_size]).numpy()
    z2 = torch.randn([args.latent_size]).numpy()
    z = to_var(
        torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    # samples, _ = model.inference(z=z, condition=torch.Tensor([[1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0]]).cuda())
    samples, _ = model.inference(z=z)
    print('-------INTERPOLATION-------')
    print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
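`to_var` above is likewise assumed rather than shown; a plausible sketch, mirroring the old Variable-wrapping utility: move the tensor to the GPU when one is available and return it:

import torch

def to_var(x):
    # on modern PyTorch, Variable wrapping is a no-op; only device placement matters
    if torch.cuda.is_available():
        x = x.cuda()
    return x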
Example no. 3
def main(args):

    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    model.load_state_dict(torch.load(args.load_checkpoint))
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()
    
    model.eval()

#     samples, z = model.inference(n=args.num_samples)
#     print('----------SAMPLES----------')
#     print(idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']))

    z1 = torch.randn([args.latent_size]).numpy()
    z2 = torch.randn([args.latent_size]).numpy()
    z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    samples, _ = model.inference(z=z)
    print('-------INTERPOLATION-------')
    print(idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']))
    
    model.load_state_dict(torch.load('bin/2019-May-16-04:24:16/E10.pytorch'))
    z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    samples, _ = model.inference(z=z)
    print('-------INTERPOLATION-------')
    print(idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']))
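`idx2word` is also undefined in these snippets. A sketch, assuming it maps each row of token ids back to a space-joined sentence and stops at the first pad token (the string keys reflect that `i2w` was loaded from JSON):

def idx2word(idx, i2w, pad_idx):
    sentences = []
    for row in idx:
        words = []
        for word_id in row:
            word_id = int(word_id)
            if word_id == pad_idx:
                break
            words.append(i2w[str(word_id)])
        sentences.append(' '.join(words))
    return sentences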
Example no. 4
def load_vae_model_from_args(args):
    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )
    tokenizer = DefaultTokenizer()

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    model.load_state_dict(torch.load(args.load_checkpoint))
    print("Model loaded from %s" % args.load_checkpoint)

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()
    return {
      'model': model,
      'tokenizer': tokenizer,
      'w2i': w2i,
      'i2w': i2w,
    }
def main(args):

    data_name = args.data_name
    with open(os.path.join(args.data_dir, data_name + '.vocab.json'), 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    model.load_state_dict(torch.load(args.load_checkpoint))
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()
    
    model.eval()

    samples, z = model.inference(n=args.num_samples)
    print('----------SAMPLES----------')
    print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
    
    z1 = torch.randn([args.latent_size]).numpy()
    z2 = torch.randn([args.latent_size]).numpy()
    z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    samples, _ = model.inference(z=z)
    print('-------INTERPOLATION-------')
    print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    print('-------Encode ... Decode-------')
    
    datasets = Amazon(
            data_dir=args.data_dir,
            split="valid",
            create_data=False,
            batch_size=10,
            max_sequence_length=args.max_sequence_length,
            min_occ=3
        )

    for input_batch_tensor, target_batch_tensor, length_batch_tensor in datasets:
        if torch.is_tensor(input_batch_tensor):
            input_batch_tensor = to_var(input_batch_tensor)

        if torch.is_tensor(target_batch_tensor):
            target_batch_tensor = to_var(target_batch_tensor)

        if torch.is_tensor(length_batch_tensor):
            length_batch_tensor = to_var(length_batch_tensor)

        print("*"*10)
        print("->"*10, *idx2word(input_batch_tensor, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
        logp, mean, logv, z = model(input_batch_tensor, length_batch_tensor)

        
        samples, z = model.inference(z=z)
        print("<-"*10, *idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
        # print("+"*10)
        break  # only decode the first batch; the original counter logic here was dead code
Example no. 6
def main(args):

    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    # required to map between integer-value sentences and real sentences
    w2i, i2w = vocab['w2i'], vocab['i2w']

    # make sure our models for the VAE and Actor exist
    if not os.path.exists(args.load_vae):
        raise FileNotFoundError(args.load_vae)

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )

    model.load_state_dict(
        torch.load(args.load_vae, map_location=lambda storage, loc: storage))
    model.eval()
    print("vae model loaded from %s"%(args.load_vae))

    # to run in constraint mode, we need the trained generator
    if args.constraint_mode:
        if not os.path.exists(args.load_actor):
            raise FileNotFoundError(args.load_actor)

        actor = Actor(
            dim_z=args.latent_size, dim_model=2048, num_labels=args.n_tags)
        actor.load_state_dict(
            torch.load(args.load_actor, map_location=lambda storage, loc:storage))
        actor.eval()
        print("actor model loaded from %s"%(args.load_actor))

    if torch.cuda.is_available():
        model = model.cuda()
        if args.constraint_mode:
            actor = actor.cuda() # TODO: to(self.devices)

    if args.sample:
        print('*** SAMPLE Z: ***')
        # get samples from the prior
        sample_sents, z = model.inference(n=args.num_samples)
        sample_sents, sample_tags = get_sents_and_tags(sample_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(), 'samples/z_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_sents, 'samples/sents_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_tags, 'samples/tags_sample_n{}.pkl'.format(args.num_samples))
        print(*sample_sents, sep='\n')

        if args.constraint_mode:

            print('*** SAMPLE Z_PRIME: ***')
            # get samples from the prior, conditioned via the actor
            all_tags_sample_prime = []
            all_sents_sample_prime = {}
            all_z_sample_prime = {}
            for i, condition in enumerate(LABELS):

                # binary vector denoting each of the PHRASE_TAGS
                labels = torch.Tensor(condition).repeat(args.num_samples, 1).cuda()

                # take z and manipulate using the actor to generate z_prime
                z_prime = actor.forward(z, labels)

                sample_sents_prime, z_prime = model.inference(
                    z=z_prime, n=args.num_samples)
                sample_sents_prime, sample_tags_prime = get_sents_and_tags(
                    sample_sents_prime, i2w, w2i)
                print('conditioned on: {}'.format(condition))
                print(*sample_sents_prime, sep='\n')
                all_tags_sample_prime.append(sample_tags_prime)
                all_sents_sample_prime[LABEL_NAMES[i]] = sample_sents_prime
                all_z_sample_prime[LABEL_NAMES[i]] = z_prime.data.cpu().numpy()
            pickle_it(all_tags_sample_prime, 'samples/tags_sample_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_sents_sample_prime, 'samples/sents_sample_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_z_sample_prime, 'samples/z_sample_prime_n{}.pkl'.format(args.num_samples))

    if args.interpolate:
        # get random samples from the latent space
        z1 = torch.randn([args.latent_size]).numpy()
        z2 = torch.randn([args.latent_size]).numpy()
        z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=args.num_samples-2)).float())

        print('*** INTERP Z: ***')
        interp_sents, _ = model.inference(z=z)
        interp_sents, interp_tags = get_sents_and_tags(interp_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(), 'samples/z_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_sents, 'samples/sents_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_tags, 'samples/tags_interp_n{}.pkl'.format(args.num_samples))
        print(*interp_sents, sep='\n')

        if args.constraint_mode:
            print('*** INTERP Z_PRIME: ***')
            all_tags_interp_prime = []
            all_sents_interp_prime = {}
            all_z_interp_prime = {}

            for i, condition in enumerate(LABELS):

                # binary vector denoting each of the PHRASE_TAGS
                labels = torch.Tensor(condition).repeat(args.num_samples, 1).cuda()

                # z prime conditioned on this particular binary variable
                z_prime = actor.forward(z, labels)

                interp_sents_prime, z_prime = model.inference(
                    z=z_prime, n=args.num_samples)
                interp_sents_prime, interp_tags_prime = get_sents_and_tags(
                    interp_sents_prime, i2w, w2i)
                print('conditioned on: {}'.format(condition))
                print(*interp_sents_prime, sep='\n')
                all_tags_interp_prime.append(interp_tags_prime)
                all_sents_interp_prime[LABEL_NAMES[i]] = interp_sents_prime
                all_z_interp_prime[LABEL_NAMES[i]] = z_prime.data.cpu().numpy()

            pickle_it(all_tags_interp_prime, 'samples/tags_interp_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_sents_interp_prime, 'samples/sents_interp_prime_n{}.pkl'.format(args.num_samples))
            pickle_it(all_z_interp_prime, 'samples/z_interp_prime_n{}.pkl'.format(args.num_samples))

    # drop into an interactive shell for ad-hoc exploration of the results
    import IPython; IPython.embed()
Example no. 7
def main(arguments):
    parser = argparse.ArgumentParser(description=__doc__,
                    formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--seed", help="Random seed", type=int, default=19)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--out_dir", help="Dir to write preds to", type=str, default='')
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--load_data", help="0 to read data from scratch", type=int, default=1)

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list", type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)

    # Model options
    parser.add_argument("--ckpt_path", help="Path to ckpt to load", type=str,
                        default=PATH_PREFIX + 'ckpts/svae/glue_svae/best.mdl')
    parser.add_argument("--vocab_path", help="Path to vocab to use", type=str,
                        default=PATH_PREFIX + 'processed_data/svae/glue_v2/vocab.json')
    parser.add_argument("--model", help="Word emb dim", type=str, default='vae')
    parser.add_argument("--embedding_size", help="Word emb dim", type=int, default=300)
    parser.add_argument("--word_dropout", help="Word emb dim", type=float, default=0.5)
    parser.add_argument("--hidden_size", help="RNN size", type=int, default=512)
    parser.add_argument("--latent_size", help="Latent vector dim", type=int, default=16)
    parser.add_argument("--num_layers", help="Number of encoder layers", type=int, default=1)
    parser.add_argument("--bidirectional", help="1 for bidirectional", type=bool, default=False)
    parser.add_argument("--rnn_type", help="Type of rnn", type=str, choices=['rnn', 'gru'],
                        default='gru')
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=64)

    # Classifier options
    parser.add_argument("--cls_batch_size", help="Batch size to use", type=int, default=64)

    args = parser.parse_args(arguments)
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    if args.log_file:
        fileHandler = logging.FileHandler(args.log_file)
        logging.getLogger().addHandler(fileHandler)
    logging.info(args)

    # define senteval params
    params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': args.use_pytorch, 'kfold': 10,
            'max_seq_len': args.max_seq_len, 'batch_size': args.batch_size, 'load_data': args.load_data,
            'seed': args.seed}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': args.cls_batch_size,
            'tenacity': 5, 'epoch_size': 4, 'cudaEfficient': True}

    # Load the sentence encoder model (SentenceVAE or SentenceAE)
    vocab = json.load(open(args.vocab_path, 'r'))
    args.denoise = False
    args.prob_swap, args.prob_drop = 0.0, 0.0
    if args.model == 'vae':
        model = SentenceVAE(args, vocab['w2i'],
                            #sos_idx=w2i['<sos>'], eos_idx=w2i['<eos>'], pad_idx=w2i['<pad>'],
                            #max_sequence_length=args.max_seq_len,
                            embedding_size=args.embedding_size,
                            rnn_type=args.rnn_type, hidden_size=args.hidden_size,
                            word_dropout=args.word_dropout, latent_size=args.latent_size,
                            num_layers=args.num_layers, bidirectional=args.bidirectional)
    elif args.model == 'ae':
        model = SentenceAE(args, vocab['w2i'],
                           embedding_size=args.embedding_size,
                           rnn_type=args.rnn_type, hidden_size=args.hidden_size,
                           word_dropout=args.word_dropout, latent_size=args.latent_size,
                           num_layers=args.num_layers, bidirectional=args.bidirectional)
    else:
        raise ValueError("Unknown model type: %s" % args.model)

    model.load_state_dict(torch.load(args.ckpt_path))
    model = model.cuda()
    model.eval()
    params_senteval['model'] = model

    # Do SentEval stuff
    se = senteval.engine.SE(params_senteval, batcher, prepare)
    tasks = get_tasks(args.tasks)
    results = se.eval(tasks)
    if args.out_dir:
        write_results(results, args.out_dir)
    if not args.log_file:
        print(results)
    else:
        logging.info(results)
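The `batcher` and `prepare` callbacks handed to `senteval.engine.SE` above are not shown. A minimal sketch of what they might look like for this encoder; stashing `w2i` in `params_senteval` and reading the posterior mean from the forward pass are assumptions, not the script's actual helpers:

import numpy as np
import torch

def prepare(params, samples):
    # SentEval calls this once per task; nothing needs precomputing here
    return

def batcher(params, batch):
    # batch is a list of tokenized sentences (lists of words)
    model, w2i = params['model'], params['w2i']  # assumed to be stored in params
    embeddings = []
    for sent in batch:
        sent = sent if sent else ['.']  # SentEval occasionally passes empty sentences
        idx = [w2i.get(w, w2i['<unk>']) for w in sent]
        inp = torch.LongTensor([idx]).cuda()
        length = torch.LongTensor([len(idx)]).cuda()
        with torch.no_grad():
            # assumption: the forward pass returns (logp, mean, logv, z);
            # the posterior mean serves as the sentence embedding
            _, mean, _, _ = model(inp, length)
        embeddings.append(mean.squeeze(0).cpu().numpy())
    return np.vstack(embeddings)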
Example no. 8
def main(args):

    data_name = args.data_name
    with open(os.path.join(args.data_dir, data_name + '.vocab.json'), 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    model.load_state_dict(torch.load(args.load_checkpoint))
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()
    
    model.eval()

    # samples, z = model.inference(n=args.num_samples)
    # print('----------SAMPLES----------')
    # print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
    
    # z1 = torch.randn([args.latent_size]).numpy()
    # z2 = torch.randn([args.latent_size]).numpy()
    # z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    # samples, _ = model.inference(z=z)
    # print('-------INTERPOLATION-------')
    # print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    # print('-------Encode ... Decode-------')
    
    # datasets = Amazon(
    #         data_dir=args.data_dir,
    #         split="valid",
    #         create_data=False,
    #         batch_size=10,
    #         max_sequence_length=args.max_sequence_length,
    #         min_occ=3
    #     )


    ### load vocab
    # with open(os.path.join(args.data_dir, args.vocab_file), 'r') as file:
    #     vocab = json.load(file)
    #     w2i, i2w = vocab['w2i'], vocab['i2w']

    tokenizer = TweetTokenizer(preserve_case=False)

    # raw_text = "I like this!"
    raw_text = "DON'T CARE FOR IT.  GAVE IT AS A GIFT AND THEY WERE OKAY WITH IT.  JUST NOT WHAT I EXPECTED."
    input_text = f_raw2vec(tokenizer, raw_text, w2i, i2w)
    length_text = len(input_text)
    length_text = [length_text]
    print("length_text", length_text)

    input_tensor = torch.LongTensor(input_text)
    print('input_tensor', input_tensor)
    input_tensor = input_tensor.unsqueeze(0)
    if torch.is_tensor(input_tensor):
        input_tensor = to_var(input_tensor)

    length_tensor = torch.LongTensor(length_text)
    print("length_tensor", length_tensor)
    # length_tensor = length_tensor.unsqueeze(0)
    if torch.is_tensor(length_tensor):
        length_tensor = to_var(length_tensor)
    
    print("*"*10)
    print("->"*10, *idx2word(input_tensor, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
    logp, mean, logv, z = model(input_tensor, length_tensor)

    # print("z", z.size(), mean_z.size())
    mean = mean.unsqueeze(0)
    print("mean", mean)
    print("z", z)

    samples, z = model.inference(z=mean)
    print("<-"*10, *idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    for i in range(10):
        samples, z = model.inference(z=z)
        print("<-"*10, *idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
Example no. 9
def main(args):

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(
            data_dir=args.data_dir,
            split=split,
            create_data=args.create_data,
            max_sequence_length=args.max_sequence_length,
            min_occ=args.min_occ
        )

    model = SentenceVAE(
        vocab_size=datasets['train'].vocab_size,
        sos_idx=datasets['train'].sos_idx,
        eos_idx=datasets['train'].eos_idx,
        pad_idx=datasets['train'].pad_idx,
        unk_idx=datasets['train'].unk_idx,
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if torch.cuda.is_available():
        model = model.cuda()
        tensor = torch.cuda.FloatTensor 
    else:
        tensor = torch.Tensor

    if args.load_checkpoint != 'None':
        if not os.path.exists(args.load_checkpoint):
            raise FileNotFoundError(args.load_checkpoint)
        model.load_state_dict(torch.load(args.load_checkpoint))


    latent_vector = defaultdict(tensor)

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args,ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    if args.file_logging:
        log_file = open(os.path.join(args.logdir, expierment_name(args,ts)+"_logfile.txt"), "w")


    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1/(1+np.exp(-k*(step-x0))))
        elif anneal_function == 'linear':
            return min(1, step/x0)

    # size_average=False is deprecated; reduction='sum' is the modern equivalent
    NLL = torch.nn.NLLLoss(reduction='sum', ignore_index=datasets['train'].pad_idx)
    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):

        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))
        
        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    step = 0
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(
                dataset=datasets[split],
                batch_size=args.batch_size,
                shuffle=split=='train',
                num_workers=cpu_count(),
                pin_memory=torch.cuda.is_available()
            )

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                fix_kl_weight = False
                model.train()
            else:
                fix_kl_weight = True
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['length'])

                if split!='train':
                    latent_vector['latent'] = torch.cat((latent_vector['latent'], z.data))

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                    batch['length'], mean, logv, args.anneal_function, step, args.k, args.x0)

                if fix_kl_weight:
                    KL_weight = 1

                loss = (NLL_loss + KL_weight * KL_loss)/batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1


                # bookkeepeing
                # use the device-appropriate `tensor` type so this also works on CPU
                tracker['ELBO'] = torch.cat((tracker['ELBO'], tensor([loss.item()])))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO"%split.upper(), loss.data[0], epoch*len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss"%split.upper(), NLL_loss.data[0]/batch_size, epoch*len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss"%split.upper(), KL_loss.data[0]/batch_size, epoch*len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight"%split.upper(), KL_weight, epoch*len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration+1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                        %(split.upper(), iteration, len(data_loader)-1, loss.item(), NLL_loss.item()/batch_size, KL_loss.item()/batch_size, KL_weight))
                    if args.file_logging:
                        log_file.write("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f \n"
                            %(split.upper(), iteration, len(data_loader)-1, loss.item(), NLL_loss.item()/batch_size, KL_loss.item()/batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data, i2w=datasets['train'].get_i2w(), pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f"%(split.upper(), epoch, args.epochs, torch.mean(tracker['ELBO'])))

            if args.file_logging:
                log_file.write("%s Epoch %02d/%i, Mean ELBO %9.4f \n" %(split.upper(), epoch, args.epochs, torch.mean(tracker['ELBO'])))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO"%split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents':tracker['target_sents'], 'z':tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/'+ts)
                with open(os.path.join('dumps', ts, 'valid_E%i.json'%epoch), 'w') as dump_file:
                    json.dump(dump,dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch"%(epoch))
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s"%checkpoint_path)

        torch.save(latent_vector['latent'], '.latent_vector_{}.pt'.format(epoch))

    if args.file_logging:
        log_file.close()
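To see what the annealing schedule in `kl_anneal_function` actually does, here is a small standalone check; the k and x0 values are illustrative, not the script's defaults:

import numpy as np

def kl_anneal_function(anneal_function, step, k, x0):
    if anneal_function == 'logistic':
        return float(1/(1+np.exp(-k*(step-x0))))
    elif anneal_function == 'linear':
        return min(1, step/x0)

# with k=0.0025 and x0=2500, the logistic weight ramps smoothly
# from ~0 to ~1, crossing 0.5 at step x0
for step in [0, 1000, 2500, 4000, 10000]:
    print(step, round(kl_anneal_function('logistic', step, 0.0025, 2500), 4))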
Example no. 10
def generate(date, epoch, sentiment, n_samples):
    cuda2 = torch.device('cuda:0')
    #date = "2020-Feb-26-17:47:47"
    #exp_descr = pd.read_csv("EXP_DESCR/" + date + ".csv")
    #print("Pretrained: ", exp_descr['pretrained'][0])
    #print("Bidirectional: ", exp_descr['Bidirectional'][0])
    #epoch = str(10)
    #data_dir = 'data'

    params = pd.read_csv("Parameters/params.csv")
    params = params.set_index('time')
    exp_descr = params.loc[date]
    # 2019-Dec-02-09:35:25, 60,300,256,0.3,0.5,16,False,0.001,10,False

    embedding_size = exp_descr["embedding_size"]
    hidden_size = exp_descr["hidden_size"]
    rnn_type = exp_descr['rnn_type']
    word_dropout = exp_descr["word_dropout"]
    embedding_dropout = exp_descr["embedding_dropout"]
    latent_size = exp_descr["latent_size"]
    num_layers = 1
    batch_size = exp_descr["batch_size"]
    bidirectional = bool(exp_descr["bidirectional"])
    max_sequence_length = exp_descr["max_sequence_length"]
    back = exp_descr["back"]
    attribute_size = exp_descr["attr_size"]
    wd_type = exp_descr["word_drop_type"]
    num_samples = 2
    save_model_path = 'bin'
    ptb = False
    if ptb:
        vocab_file = 'ptb.vocab.json'
    else:
        vocab_file = 'yelp_vocab.json'

    with open(os.path.join("bin", date, vocab_file), 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(vocab_size=len(w2i),
                        sos_idx=w2i['<sos>'],
                        eos_idx=w2i['<eos>'],
                        pad_idx=w2i['<pad>'],
                        unk_idx=w2i['<unk>'],
                        max_sequence_length=max_sequence_length,
                        embedding_size=embedding_size,
                        rnn_type=rnn_type,
                        hidden_size=hidden_size,
                        word_dropout=0,
                        embedding_dropout=0,
                        latent_size=latent_size,
                        num_layers=num_layers,
                        cuda=cuda2,
                        bidirectional=bidirectional,
                        attribute_size=attribute_size,
                        word_dropout_type='static',
                        back=back)

    print(model)
    # Results
    # 2019-Nov-28-13:23:06/E4-5".pytorch"

    load_checkpoint = "bin/" + date + "/" + "E" + str(epoch) + ".pytorch"
    # load_checkpoint = "bin/2019-Nov-28-12:03:44 /E0.pytorch"

    if not os.path.exists(load_checkpoint):
        raise FileNotFoundError(load_checkpoint)

    if torch.cuda.is_available():
        model = model.cuda()
        device = "cuda"
    else:
        device = "cpu"

    model.load_state_dict(
        torch.load(load_checkpoint, map_location=torch.device(device)))

    def attr_generation(n):
        labels = np.random.randint(2, size=n)
        enc = OneHotEncoder(handle_unknown='ignore')
        labels = np.reshape(labels, (len(labels), 1))
        enc.fit(labels)
        one_hot = enc.transform(labels).toarray()
        one_hot = one_hot.astype(np.float32)
        one_hot = torch.from_numpy(one_hot)
        return one_hot

    model.eval()
    labels = attr_generation(n=num_samples)

    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    from sklearn.metrics import accuracy_score
    analyser = SentimentIntensityAnalyzer()

    def sentiment_analyzer_scores(sentence):
        score = analyser.polarity_scores(sentence)
        if score['compound'] > 0.05:
            return 1, 'Positive'
        else:
            return 0, 'Negative'

    print('----------SAMPLES----------')
    labels = []  # note: overwrites the one-hot labels generated above, which go unused
    generated = []
    for i in range(n_samples):
        samples, z, l = model.inference(sentiment)
        s = idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>'])
        #print(sentiment_analyzer_scores(s[0]))
        if sentiment_analyzer_scores(s[0])[1] == sentiment:
            generated.append(s[0])

        labels.append(sentiment_analyzer_scores(s[0])[0])
        #print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
    print(sum(labels))
    return generated
Example no. 11
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

with open(args.data_dir + '/ptb.vocab.json', 'r') as file:
    vocab = json.load(file)

w2i, i2w = vocab['w2i'], vocab['i2w']
embedding = KeyedVectors.load('model/pretrained_embedding')
weights = torch.FloatTensor(embedding.vectors)  # 'syn0' was renamed to 'vectors' in newer gensim

model = SentenceVAE(vocab_size=weights.size(0),
                    sos_idx=w2i['<sos>'],
                    eos_idx=w2i['<eos>'],
                    pad_idx=w2i['<pad>'])
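# note: the pretrained `weights` loaded above are never handed to the model in
# this snippet; presumably this SentenceVAE variant consumes them internally,
# otherwise they would need to be copied into the embedding layer explicitly,
# e.g. (layer name assumed): model.embedding.weight.data.copy_(weights)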

model.load_state_dict(torch.load(args.load_checkpoint))
print("Model loaded from %s" % (args.load_checkpoint))

if torch.cuda.is_available():
    model = model.cuda()

model.eval()

print('----------SAMPLES----------')
for i in range(5):
    sample, z = model.inference()
    sample = sample.cpu().numpy()
    print(sample)
    print(*idx2word(sample, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

datasets = OrderedDict()
Example no. 12
def main(args):
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(data_dir=args.data_dir,
                              split=split,
                              create_data=args.create_data,
                              max_sequence_length=args.max_sequence_length,
                              min_occ=args.min_occ)

    with open(args.data_dir + '/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    with open(os.path.join(args.save_model_path, 'model_params.json'),
              'r') as f:
        params = json.load(f)
    model = SentenceVAE(**params)
    model.load_state_dict(torch.load(args.load_checkpoint))
    print("Model loaded from %s" % args.load_checkpoint)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    with torch.no_grad():

        input_sent = "the n stock specialist firms on the big board floor the buyers and sellers of last resort who were criticized after the n crash once again could n't handle the selling pressure"
        # fall back to <unk> for OOV tokens and follow the model's device
        device = next(model.parameters()).device
        batch_input = torch.LongTensor(
            [[w2i.get(i, w2i['<unk>']) for i in input_sent.split()]]).to(device)
        batch_len = torch.LongTensor([len(input_sent.split())]).to(device)
        input_mean = model(batch_input, batch_len, output_mean=True)

        data_loader = DataLoader(dataset=datasets["train"],
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=cpu_count(),
                                 pin_memory=torch.cuda.is_available())

        print('---------CALCULATING NEAREST SENTENCES--------')

        sim = []
        all_sentences = []
        for iteration, batch in enumerate(data_loader):

            for k, v in batch.items():
                if torch.is_tensor(v):
                    batch[k] = to_var(v)

            all_sentences.append(batch['input'])

            # Forward pass
            mean = model(batch['input'], batch['length'], output_mean=True)
            batch_sim = torch.abs(mean - input_mean)
            sim.append(batch_sim)
        sim = torch.cat(sim, dim=0)
        _, most_similar_per_dim = torch.topk(-sim, k=20, dim=0)
        most_similar_per_dim = most_similar_per_dim.transpose(0, 1)
        all_sentences = torch.cat(all_sentences, dim=0)
        for dim, i in enumerate(most_similar_per_dim):
            sentences = torch.index_select(all_sentences, dim=0, index=i)
            print(f"{dim=}")
            print(*idx2word(sentences, i2w=i2w, pad_idx=w2i['<pad>']),
                  sep='\n')
Example no. 13
def main(args):

    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    model.load_state_dict(torch.load(args.load_checkpoint))
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()
    
    model.eval()

    # samples, z = model.inference(n=args.num_samples)
    # print('----------SAMPLES----------')
    # print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    # z1 = torch.randn([args.latent_size]).numpy()
    # z2 = torch.randn([args.latent_size]).numpy()
    # z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    # samples, _ = model.inference(z=z)
    # print('-------INTERPOLATION-------')
    # print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    print('-------Encode ... Decode-------')
    
    datasets = PTB(
            data_dir=args.data_dir,
            split="valid",
            create_data=False,
            max_sequence_length=args.max_sequence_length,
            min_occ=1
        )
    data_loader = DataLoader(dataset=datasets,
                             batch_size=2,
                             shuffle=False,  # was shuffle='valid': any non-empty string is truthy
                             num_workers=cpu_count(),
                             pin_memory=torch.cuda.is_available())

    for iteration, batch in enumerate(data_loader):
        batch_size = batch['input'].size(0)
        for k, v in batch.items():
            if torch.is_tensor(v):
                batch[k] = to_var(v)

        print("*"*10)
        print(*idx2word(batch['input'], i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
        logp, mean, logv, z = model(batch['input'], batch['length'])

        print("+"*10)
        samples, z = model.inference(z=z)
        print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

        
        if iteration == 0:
            break  # only decode the first batch