Esempio n. 1
0
def main(args):
    """Restore a trained SentenceVAE and print latent-space interpolations.

    The PTB vocabulary is read from ``args.data_dir``, the model is rebuilt
    from the hyper-parameters carried on ``args`` and its weights are restored
    from ``args.load_checkpoint``. Two random latent vectors are drawn and the
    interpolation between them is decoded and printed; the same pair is then
    decoded again after loading a second, hard-coded checkpoint.

    Args:
        args: namespace providing ``data_dir``, ``load_checkpoint`` and the
            model hyper-parameters read below.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # Deserialize onto the CPU first so a checkpoint written on a GPU machine
    # can still be restored on a CPU-only host; the model is moved to the GPU
    # afterwards when one is available.
    cpu = torch.device('cpu')
    model.load_state_dict(torch.load(args.load_checkpoint, map_location=cpu))
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    # Both decodes below use the same pair of random end points.
    z1 = torch.randn([args.latent_size]).numpy()
    z2 = torch.randn([args.latent_size]).numpy()
    z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    samples, _ = model.inference(z=z)
    print('-------INTERPOLATION-------')
    print(idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']))

    # NOTE(review): this second checkpoint path is hard-coded; it probably
    # belongs on ``args`` -- confirm with the script's owner.
    second_checkpoint = 'bin/2019-May-16-04:24:16/E10.pytorch'
    model.load_state_dict(torch.load(second_checkpoint, map_location=cpu))
    z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    samples, _ = model.inference(z=z)
    print('-------INTERPOLATION-------')
    print(idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']))
Esempio n. 2
0
def main(args):
    """Sample poems from a trained SentenceVAE and print an interpolation.

    Reads ``poems.vocab.json`` from ``args.data_dir``, rebuilds the model from
    the hyper-parameters on ``args`` (with ``condition_size=0``), restores the
    weights from ``args.load_checkpoint`` and prints:

    * ``args.num_samples`` sequences returned by ``model.inference(n=...)``;
    * the sequences decoded along an 8-step interpolation between two random
      latent vectors.

    A conditioned variant (passing ``condition=`` to ``model.inference``) was
    left commented out in the original script and is not executed here.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    vocab_path = args.data_dir + '/poems.vocab.json'
    with open(vocab_path, 'r') as vocab_file:
        vocab = json.load(vocab_file)

    w2i = vocab['w2i']
    i2w = vocab['i2w']

    # Collected in the same order the constructor arguments were originally
    # evaluated in.
    model_kwargs = dict(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,
        condition_size=0,
    )
    model = SentenceVAE(**model_kwargs)

    checkpoint = args.load_checkpoint
    if not os.path.exists(checkpoint):
        raise FileNotFoundError(checkpoint)

    # Weights are deserialized onto the CPU and moved to the GPU afterwards.
    state_dict = torch.load(checkpoint, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    print("Model loaded from %s" % (args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()
    model.eval()

    def show(title, token_ids):
        # Print a section header followed by one decoded sequence per line.
        print(title)
        print(*idx2word(token_ids, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    prior_samples, _ = model.inference(n=args.num_samples)
    show('----------SAMPLES----------', prior_samples)

    start = torch.randn([args.latent_size]).numpy()
    end = torch.randn([args.latent_size]).numpy()
    latent_path = interpolate(start=start, end=end, steps=8)
    z = to_var(torch.from_numpy(latent_path).float())
    interpolated, _ = model.inference(z=z)
    show('-------INTERPOLATION-------', interpolated)
Esempio n. 3
0
def main(args):
    """Encode one sentence and print decodes while perturbing each latent index.

    The PTB vocabulary is read from ``args.data_dir`` and the model weights
    from ``args.load_checkpoint``. A fixed example sentence is encoded with
    ``output_mean=True``; then, for every index below ``args.latent_size``,
    the encoded array is shifted by -0.5 and +0.5 at that index and the
    5-step interpolation between the two shifted copies is decoded and
    printed.

    Args:
        args: namespace providing ``data_dir``, ``load_checkpoint`` and the
            model hyper-parameters read below.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    with open(args.data_dir + '/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(vocab_size=len(w2i),
                        sos_idx=w2i['<sos>'],
                        eos_idx=w2i['<eos>'],
                        pad_idx=w2i['<pad>'],
                        unk_idx=w2i['<unk>'],
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # Deserialize onto the CPU so GPU-written checkpoints also load on
    # CPU-only hosts; the model is moved to the GPU below when available.
    model.load_state_dict(
        torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
    print("Model loaded from %s" % args.load_checkpoint)

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    # Build the inputs on whatever device the model actually lives on instead
    # of calling .cuda() unconditionally, which fails without a GPU.
    device = next(model.parameters()).device

    input_sent = "looking for a job was one of the most anxious periods of my life and is for most people"
    tokens = input_sent.split()
    # Words missing from the vocabulary map to <unk> rather than raising
    # KeyError.
    unk_idx = w2i['<unk>']
    batch_input = torch.LongTensor(
        [[w2i.get(token, unk_idx) for token in tokens]]).to(device)
    batch_len = torch.LongTensor([len(tokens)]).to(device)
    input_mean = model(batch_input, batch_len, output_mean=True)
    z_ = input_mean.cpu().detach().numpy()
    print(z_.shape)

    # NOTE(review): ``z1[i]`` indexes the first axis of ``z_``; this presumes
    # the model returns an array whose first axis has ``latent_size`` entries
    # -- confirm against the shape printed above.
    for i in range(args.latent_size):
        print(f"-------Dimension {i}------")
        z1, z2 = z_.copy(), z_.copy()
        z1[i] -= 0.5
        z2[i] += 0.5
        z = to_var(
            torch.from_numpy(interpolate(start=z1, end=z2, steps=5)).float())
        samples, _ = model.inference(z=z)
        print('-------INTERPOLATION-------')
        print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
def main(args):
    """Sample, interpolate and round-trip one validation batch through the VAE.

    The vocabulary file ``<data_dir><data_name>.vocab.json`` is read, the
    model is rebuilt from the hyper-parameters on ``args`` and restored from
    ``args.load_checkpoint``. The function then prints:

    * ``args.num_samples`` sequences returned by ``model.inference(n=...)``;
    * the decodes along an 8-step interpolation between two random latents;
    * the first batch of the ``Amazon`` "valid" split, followed by the
      sequences decoded from the latent codes the model produced for it.

    Args:
        args: namespace providing ``data_name``, ``data_dir``,
            ``load_checkpoint``, ``num_samples`` and the model
            hyper-parameters read below.

    Raises:
        FileNotFoundError: if ``args.load_checkpoint`` does not exist.
    """
    data_name = args.data_name
    # NOTE(review): no path separator is inserted here, so ``args.data_dir``
    # presumably ends with one -- confirm against the argument parser.
    with open(args.data_dir+data_name+'.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # Deserialize onto the CPU so GPU-written checkpoints also load on
    # CPU-only hosts; the model is moved to the GPU below when available.
    model.load_state_dict(
        torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    samples, z = model.inference(n=args.num_samples)
    print('----------SAMPLES----------')
    print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    z1 = torch.randn([args.latent_size]).numpy()
    z2 = torch.randn([args.latent_size]).numpy()
    z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=8)).float())
    samples, _ = model.inference(z=z)
    print('-------INTERPOLATION-------')
    print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    print('-------Encode ... Decode-------')

    datasets = Amazon(
            data_dir=args.data_dir,
            split="valid",
            create_data=False,
            batch_size=10,
            max_sequence_length=args.max_sequence_length,
            min_occ=3
        )

    for input_batch_tensor, target_batch_tensor, length_batch_tensor in datasets:
        if torch.is_tensor(input_batch_tensor):
            input_batch_tensor = to_var(input_batch_tensor)

        if torch.is_tensor(target_batch_tensor):
            target_batch_tensor = to_var(target_batch_tensor)

        if torch.is_tensor(length_batch_tensor):
            length_batch_tensor = to_var(length_batch_tensor)

        print("*"*10)
        print("->"*10, *idx2word(input_batch_tensor, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
        logp, mean, logv, z = model(input_batch_tensor, length_batch_tensor)

        # Decode from the latent codes the model produced for this batch.
        samples, z = model.inference(z=z)
        print("<-"*10, *idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

        # Only the first batch is shown. The original kept an iteration
        # counter whose ``== 0`` test was always true on the first pass, so
        # the loop never ran a second time.
        break
Esempio n. 5
0
def _decode_with_actor(model, actor, z, num_samples, i2w, w2i, kind):
    """Decode ``z`` once per entry of ``LABELS`` after the actor transformed it.

    For every condition the label vector is repeated ``num_samples`` times,
    ``actor.forward(z, labels)`` yields the modified latents, and those are
    decoded with ``model.inference``. Sentences, tags and latents are printed
    and pickled under ``samples/``; ``kind`` ('sample' or 'interp') only
    selects the output file names.
    """
    all_tags_prime = []
    all_sents_prime = {}
    all_z_prime = {}

    for i, condition in enumerate(LABELS):
        # Create the labels on the same device as ``z`` instead of calling
        # .cuda() unconditionally, so constraint mode also runs without a GPU.
        labels = torch.Tensor(condition).repeat(num_samples, 1).to(z.device)

        # take z and manipulate it using the actor to generate z_prime
        z_prime = actor.forward(z, labels)

        sents_prime, z_prime = model.inference(z=z_prime, n=num_samples)
        sents_prime, tags_prime = get_sents_and_tags(sents_prime, i2w, w2i)
        print('conditoned on: {}'.format(condition))
        print(sents_prime, sep='\n')
        all_tags_prime.append(tags_prime)
        all_sents_prime[LABEL_NAMES[i]] = sents_prime
        all_z_prime[LABEL_NAMES[i]] = z_prime.data.cpu().numpy()

    pickle_it(all_tags_prime, 'samples/tags_{}_prime_n{}.pkl'.format(kind, num_samples))
    pickle_it(all_sents_prime, 'samples/sents_{}_prime_n{}.pkl'.format(kind, num_samples))
    pickle_it(all_z_prime, 'samples/z_{}_prime_n{}.pkl'.format(kind, num_samples))


def main(args):
    """Sample and/or interpolate with a trained VAE, optionally via an actor.

    The PTB vocabulary is read from ``args.data_dir`` and the VAE weights
    from ``args.load_vae``. When ``args.constraint_mode`` is set, an ``Actor``
    is additionally restored from ``args.load_actor`` and used to transform
    the latents once per entry of ``LABELS``.

    * ``args.sample``: decodes ``args.num_samples`` latents returned by
      ``model.inference(n=...)``.
    * ``args.interpolate``: decodes the interpolation between two random
      latents (``steps=args.num_samples - 2``).

    Sentences, tags and latents of every run are pickled under ``samples/``.
    The function ends by opening an interactive IPython shell.

    Raises:
        FileNotFoundError: if ``args.load_vae`` (or, in constraint mode,
            ``args.load_actor``) does not exist.
    """
    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    # required to map between integer-value sentences and real sentences
    w2i, i2w = vocab['w2i'], vocab['i2w']

    # make sure our models for the VAE and Actor exist
    if not os.path.exists(args.load_vae):
        raise FileNotFoundError(args.load_vae)

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )

    # map_location keeps the tensors on the CPU while deserializing
    model.load_state_dict(
        torch.load(args.load_vae, map_location=lambda storage, loc: storage))
    model.eval()
    print("vae model loaded from %s"%(args.load_vae))

    # to run in constraint mode, we need the trained generator
    actor = None
    if args.constraint_mode:
        if not os.path.exists(args.load_actor):
            raise FileNotFoundError(args.load_actor)

        actor = Actor(
            dim_z=args.latent_size, dim_model=2048, num_labels=args.n_tags)
        actor.load_state_dict(
            torch.load(args.load_actor, map_location=lambda storage, loc:storage))
        actor.eval()
        print("actor model loaded from %s"%(args.load_actor))

    if torch.cuda.is_available():
        model = model.cuda()
        if args.constraint_mode:
            actor = actor.cuda() # TODO: to(self.devices)

    if args.sample:
        print('*** SAMPLE Z: ***')
        # get samples from the prior
        sample_sents, z = model.inference(n=args.num_samples)
        sample_sents, sample_tags = get_sents_and_tags(sample_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(), 'samples/z_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_sents, 'samples/sents_sample_n{}.pkl'.format(args.num_samples))
        pickle_it(sample_tags, 'samples/tags_sample_n{}.pkl'.format(args.num_samples))
        print(sample_sents, sep='\n')

        if args.constraint_mode:
            print('*** SAMPLE Z_PRIME: ***')
            # same latents, conditioned via the actor
            _decode_with_actor(
                model, actor, z, args.num_samples, i2w, w2i, 'sample')

    if args.interpolate:
        # get random samples from the latent space
        z1 = torch.randn([args.latent_size]).numpy()
        z2 = torch.randn([args.latent_size]).numpy()
        z = to_var(torch.from_numpy(interpolate(start=z1, end=z2, steps=args.num_samples-2)).float())

        print('*** INTERP Z: ***')
        interp_sents, _ = model.inference(z=z)
        interp_sents, interp_tags = get_sents_and_tags(interp_sents, i2w, w2i)
        pickle_it(z.cpu().numpy(), 'samples/z_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_sents, 'samples/sents_interp_n{}.pkl'.format(args.num_samples))
        pickle_it(interp_tags, 'samples/tags_interp_n{}.pkl'.format(args.num_samples))
        print(interp_sents, sep='\n')

        if args.constraint_mode:
            print('*** INTERP Z_PRIME: ***')
            _decode_with_actor(
                model, actor, z, args.num_samples, i2w, w2i, 'interp')

    # Drop into an interactive shell so the results can be inspected.
    import IPython; IPython.embed()
Esempio n. 6
0
def main(args):
    """Train a SentenceVAE with a pretrained encoder on the Yelp polarity data.

    The "train" split of ``yelp_polarity`` is re-split with a fixed seed into
    train (all but 2,000 rows), validation (1,000) and test (1,000) sets. The
    encoder comes from ``AutoModel.from_pretrained(args.model_name)``; its
    embeddings are frozen unless ``args.train_embeddings`` is set, and all but
    the last ``args.trainable_layers`` encoder layers are frozen.

    Side effects:
        * creates ``<args.save_model_path>/<timestamp>/`` holding
          ``model_params.json``, ``train_args.json`` and one
          ``E<epoch>.pytorch`` checkpoint per epoch;
        * writes ``dumps/<timestamp>/valid_E<epoch>.json`` with the validation
          targets, the generated sentences and the latent codes;
        * logs to TensorBoard when ``args.tensorboard_logging`` is set.

    Args:
        args: namespace carrying the training configuration read below.

    Raises:
        ValueError: if ``args.anneal_function`` is neither 'logistic' nor
            'linear' once annealing starts (step > ``args.x0``).
    """
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    RANDOM_SEED = 42

    dataset = load_dataset("yelp_polarity", split="train")
    TRAIN_SIZE = len(dataset) - 2_000
    VALID_SIZE = 1_000
    TEST_SIZE = 1_000

    train_test_split = dataset.train_test_split(train_size=TRAIN_SIZE,
                                                seed=RANDOM_SEED)
    train_dataset = train_test_split["train"]
    test_val_dataset = train_test_split["test"].train_test_split(
        train_size=VALID_SIZE, test_size=TEST_SIZE, seed=RANDOM_SEED)
    val_dataset, test_dataset = test_val_dataset["train"], test_val_dataset[
        "test"]

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    datasets = OrderedDict()
    datasets['train'] = TextDataset(train_dataset, tokenizer,
                                    args.max_sequence_length,
                                    not args.disable_sent_tokenize)
    datasets['valid'] = TextDataset(val_dataset, tokenizer,
                                    args.max_sequence_length,
                                    not args.disable_sent_tokenize)
    if args.test:
        # The key must match the 'test' entry of ``splits``; it used to be
        # stored under 'text', which made ``datasets[split]`` raise KeyError.
        datasets['test'] = TextDataset(test_dataset, tokenizer,
                                       args.max_sequence_length,
                                       not args.disable_sent_tokenize)

    print(
        f"Loading {args.model_name} model. Setting {args.trainable_layers} trainable layers."
    )
    encoder = AutoModel.from_pretrained(args.model_name, return_dict=True)
    if not args.train_embeddings:
        for p in encoder.embeddings.parameters():
            p.requires_grad = False
    encoder_layers = encoder.encoder.layer
    if args.trainable_layers > len(encoder_layers):
        warnings.warn(
            f"You are asking to train {args.trainable_layers} layers, but this model has only {len(encoder_layers)}"
        )
    # Freeze the leading layers so only the last ``trainable_layers`` train.
    for layer in range(len(encoder_layers) - args.trainable_layers):
        for p in encoder_layers[layer].parameters():
            p.requires_grad = False
    params = dict(vocab_size=datasets['train'].vocab_size,
                  embedding_size=args.embedding_size,
                  rnn_type=args.rnn_type,
                  hidden_size=args.hidden_size,
                  word_dropout=args.word_dropout,
                  embedding_dropout=args.embedding_dropout,
                  latent_size=args.latent_size,
                  num_layers=args.num_layers,
                  bidirectional=args.bidirectional,
                  max_sequence_length=args.max_sequence_length)
    model = SentenceVAE(encoder=encoder, tokenizer=tokenizer, **params)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    with open(os.path.join(save_model_path, 'model_params.json'), 'w') as f:
        json.dump(params, f, indent=4)
    with open(os.path.join(save_model_path, 'train_args.json'), 'w') as f:
        json.dump(vars(args), f, indent=4)

    def kl_anneal_function(anneal_function, step, k, x0):
        """Return the KL weight for ``step``.

        Up to and including step ``x0`` the weight is
        ``args.initial_kl_weight``; afterwards it follows the selected
        schedule.
        """
        if step <= x0:
            return args.initial_kl_weight
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0 - 2500))))
        elif anneal_function == 'linear':
            # NOTE(review): this branch is only reached when step > x0, so
            # the result is always 1 -- confirm that is the intended schedule.
            return min(1, step / x0)
        # Fail loudly instead of returning None, which would only surface
        # later as a TypeError when the weight is multiplied into the loss.
        raise ValueError(
            "unknown anneal_function: %r" % (anneal_function,))

    NLL = torch.nn.NLLLoss(ignore_index=datasets['train'].pad_idx,
                           reduction='sum')

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):
        """Return ``(NLL_loss, KL_loss, KL_weight)`` for one batch."""

        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    # The encoder gets its own learning rate; the remaining modules use the
    # optimizer default (``args.learning_rate``).
    params = [{
        'params': model.encoder.parameters(),
        'lr': args.encoder_learning_rate
    }, {
        'params': [
            *model.decoder_rnn.parameters(), *model.hidden2mean.parameters(),
            *model.hidden2logv.parameters(), *model.latent2hidden.parameters(),
            *model.outputs2vocab.parameters()
        ]
    }]
    optimizer = torch.optim.Adam(params,
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available(
    ) else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=(split == 'train'),
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available(),
                                     collate_fn=DataCollator(tokenizer))

            # Missing keys start out as an empty tensor so values can be
            # concatenated onto them directly.
            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'],
                                            batch['attention_mask'],
                                            batch['length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                       batch['length'], mean,
                                                       logv,
                                                       args.anneal_function,
                                                       step, args.k, args.x0)

                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat(
                    (tracker['ELBO'], loss.data.view(1, -1)), dim=0)

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(),
                                      NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(),
                                      KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(),
                                      KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(
                        data_loader):
                    print(
                        "%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                        % (split.upper(), iteration, len(data_loader) - 1,
                           loss.item(), NLL_loss.item() / batch_size,
                           KL_loss.item() / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(
                        batch['target'].tolist(), tokenizer=tokenizer)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f" %
                  (split.upper(), epoch, args.epochs, tracker['ELBO'].mean()))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(),
                                  torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences, the encoded latent space and generated sequences
            if split == 'valid':
                samples, _ = model.inference(z=tracker['z'])
                generated_sents = idx2word(samples.tolist(), tokenizer)
                sents = [{
                    'original': target,
                    'generated': generated
                } for target, generated in zip(tracker['target_sents'],
                                               generated_sents)]
                dump = {'sentences': sents, 'z': tracker['z'].tolist()}
                dump_dir = os.path.join('dumps', ts)
                # exist_ok replaces the separate exists() check.
                os.makedirs(dump_dir, exist_ok=True)
                with open(os.path.join(dump_dir, 'valid_E%i.json' % epoch),
                          'w') as dump_file:
                    json.dump(dump, dump_file, indent=3)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path,
                                               "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
Esempio n. 7
0
def main(args):

    # Load the vocab
    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    # Initialize semantic loss
    sl = Semantic_Loss()

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(
            data_dir=args.data_dir,
            split=split,
            create_data=args.create_data,
            max_sequence_length=args.max_sequence_length,
            min_occ=args.min_occ
        )

    params = dict(
        vocab_size=datasets['train'].vocab_size,
        sos_idx=datasets['train'].sos_idx,
        eos_idx=datasets['train'].eos_idx,
        pad_idx=datasets['train'].pad_idx,
        unk_idx=datasets['train'].unk_idx,
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
    )
    model = SentenceVAE(**params)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    with open(os.path.join(save_model_path, 'model_params.json'), 'w') as f:
        json.dump(params, f, indent=4)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1/(1+np.exp(-k*(step-x0))))
        elif anneal_function == 'linear':
            return min(1, step/x0)

    def perplexity_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1/ 1+np.exp(-k*(step-x0)))
        elif anneal_function == 'linear':
            return min(1, (step/x0))

    NLL = torch.nn.NLLLoss(ignore_index=datasets['train'].pad_idx, reduction='sum')
    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0, \
        batch_perplexity, perplexity_anneal_function):

        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        # Perplexity
        perp_loss = batch_perplexity
        perp_weight = perplexity_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight, perp_loss, perp_weight


    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        # Keep track of epoch loss
        epoch_loss = []

        for split in splits:

            data_loader = DataLoader(
                dataset=datasets[split],
                batch_size=args.batch_size,
                shuffle=split=='train',
                num_workers=cpu_count(),
                pin_memory=torch.cuda.is_available()
            )

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            batch_t_start = None

            for iteration, batch in enumerate(data_loader):

                if batch_t_start:
                    batch_run_time = time.time() - batch_t_start
                    # print("Batch run time: " + str(batch_run_time))
                batch_t_start = time.time()


                batch_size = batch['input_sequence'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Get the original sentences in this batch
                batch_sentences = idx2word(batch['input_sequence'], i2w=i2w, pad_idx=w2i['<pad>'])
                # Remove the first tag
                batch_sentences = [x.replace("<sos>", "") for x in batch_sentences]

                # Forward pass
                (logp, mean, logv, z), states = model(**batch)


                # Choose some random pairs of samples within the batch
                #  to get latent representations for
                batch_index_pairs = list(itertools.combinations(np.arange(batch_size), 2))
                random.shuffle(batch_index_pairs)
                batch_index_pairs = batch_index_pairs[:args.perplexity_samples_per_batch]

                batch_perplexity = []

                # If we start the perplexity
                start_perplexity = epoch > 10

                # If we should have perplexity loss
                if start_perplexity and args.perplexity_loss:
                    # For each pair, get the intermediate representations in the latent space
                    for index_pair in batch_index_pairs:

                        with torch.no_grad():
                            z1_hidden = states['z'][index_pair[0]].cpu()
                            z2_hidden = states['z'][index_pair[1]].cpu()

                        z_hidden = to_var(torch.from_numpy(interpolate(start=z1_hidden, end=z2_hidden, steps=1)).float())

                        if args.rnn_type == "lstm":

                            with torch.no_grad():
                                z1_cell_state = states['z_cell_state'].cpu().squeeze()[index_pair[0]]
                                z2_cell_state = states['z_cell_state'].cpu().squeeze()[index_pair[1]]

                            z_cell_states = \
                                to_var(torch.from_numpy(interpolate(start=z1_cell_state, end=z2_cell_state, steps=1)).float())

                            samples, _ = model.inference(z=z_hidden, z_cell_state=z_cell_states)
                        else:
                            samples, _ = model.inference(z=z_hidden, z_cell_state=None)

                        # Check interpolated sentences
                        interpolated_sentences = idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>'])
                        # For each sentence, get the perplexity and show it
                        perplexities = []
                        for sentence in interpolated_sentences:
                            perplexities.append(sl.get_perplexity(sentence))
                        avg_sample_perplexity = sum(perplexities) / len(perplexities)
                        batch_perplexity.append(avg_sample_perplexity)
                    # Calculate batch perplexity
                    avg_batch_perplexity = sum(batch_perplexity) / len(batch_perplexity)

                    # loss calculation
                    NLL_loss, KL_loss, KL_weight, perp_loss, perp_weight = loss_fn(logp, batch['target'],
                        batch['length'], mean, logv, args.anneal_function, step, \
                            args.k, args.x0, avg_batch_perplexity, perplexity_anneal_function)

                    loss = ((NLL_loss + KL_weight * KL_loss) / batch_size) + (perp_loss * perp_weight)

                else: # Epochs < X, so train without perplexity
                    # loss calculation
                    NLL_loss, KL_loss, KL_weight, perp_loss, perp_weight = loss_fn(logp, batch['target'],
                        batch['length'], mean, logv, args.anneal_function, step, \
                            args.k, args.x0, 0, perplexity_anneal_function)

                    loss = (NLL_loss + KL_weight * KL_loss) / batch_size


                # Turn model back into train, since inference changed to eval
                if split == 'train':
                    model.train()
                else:
                    model.eval()

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                    # Add loss
                    epoch_loss.append(loss.item())

                # bookkeepeing
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.data.view(1, -1)), dim=0)

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(), epoch*len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(), NLL_loss.item() / batch_size,
                                      epoch*len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch*len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                      epoch*len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration+1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f, Perp-loss %9.4f, Perp-weight %6.3f"
                          % (split.upper(), iteration, len(data_loader)-1, loss.item(), NLL_loss.item()/batch_size,
                          KL_loss.item()/batch_size, KL_weight, perp_loss, perp_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data, i2w=datasets['train'].get_i2w(),
                                                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f" % (split.upper(), epoch, args.epochs, tracker['ELBO'].mean()))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/'+ts)
                with open(os.path.join('dumps/'+ts+'/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump,dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % epoch)
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)
Esempio n. 8
0
def main(args):
    """Encode one hard-coded review with a trained SentenceVAE and decode it.

    The function loads the vocabulary and the checkpoint named by ``args``,
    converts a fixed raw sentence to token ids, runs it through the model,
    decodes once from the (unsqueezed) posterior mean and then decodes ten
    more times, each time feeding back the latent code returned by the
    previous ``model.inference`` call.  Everything is printed to stdout.

    Args:
        args: Parsed command-line namespace.  Reads ``data_dir``,
            ``data_name``, ``load_checkpoint`` and the hyper-parameters
            forwarded to ``SentenceVAE`` below.

    Raises:
        FileNotFoundError: If the vocabulary file or ``args.load_checkpoint``
            does not exist.
    """
    data_name = args.data_name
    # NOTE: plain string concatenation -- ``args.data_dir`` has to end with a
    # path separator already for this to point inside the directory.
    with open(args.data_dir+data_name+'.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # Deserialize onto the CPU: the model has not been moved to the GPU yet,
    # and without map_location a checkpoint written from a GPU cannot be
    # loaded on a host without CUDA.
    state_dict = torch.load(args.load_checkpoint,
                            map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    tokenizer = TweetTokenizer(preserve_case=False)

    raw_text = "DON'T CARE FOR IT.  GAVE IT AS A GIFT AND THEY WERE OKAY WITH IT.  JUST NOT WHAT I EXPECTED."
    input_text = f_raw2vec(tokenizer, raw_text, w2i, i2w)
    length_text = [len(input_text)]
    print("length_text", length_text)

    input_tensor = torch.LongTensor(input_text)
    print('input_tensor', input_tensor)
    # unsqueeze(0) adds a leading dimension so the single sentence is passed
    # as a batch of one.
    input_tensor = to_var(input_tensor.unsqueeze(0))

    length_tensor = torch.LongTensor(length_text)
    print("length_tensor", length_tensor)
    length_tensor = to_var(length_tensor)

    print("*"*10)
    print("->"*10, *idx2word(input_tensor, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
    logp, mean, logv, z = model(input_tensor, length_tensor)

    mean = mean.unsqueeze(0)
    print("mean", mean)
    print("z", z)

    # Decode from the posterior mean first ...
    samples, z = model.inference(z=mean)
    print("<-"*10, *idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

    # ... then keep re-decoding from whatever latent code inference returns.
    for _ in range(10):
        samples, z = model.inference(z=z)
        print("<-"*10, *idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
def main(args):
    """Train a ``SentenceVAE`` on the PTB splits and optionally sample from it.

    Every epoch iterates over the ``train`` and ``valid`` splits (plus
    ``test`` when ``args.test`` is truthy).  Only ``train`` batches are
    optimised, using the annealed KL weight; the other splits report
    ``recon_loss + KL_loss`` with full KL weight.  After each ``valid`` pass
    the target sentences and latent codes are dumped to
    ``dumps/<timestamp>/valid_E<epoch>.json``; after each ``train`` pass a
    checkpoint is written to ``<args.save_model_path>/<timestamp>/``.  When
    ``args.num_samples`` is truthy, that many sentences are generated and
    printed once training has finished.

    Args:
        args: Parsed command-line namespace.  Reads ``test``, ``data_dir``,
            ``create_data``, ``max_sequence_length``, ``min_occ``, the model
            hyper-parameters forwarded to ``SentenceVAE``,
            ``tensorboard_logging``, ``logdir``, ``save_model_path``,
            ``batch_size``, ``epochs``, ``warmup``, ``learning_rate``,
            ``anneal_function``, ``print_every`` and ``num_samples``.

    Raises:
        FileExistsError: If the checkpoint directory for the current
            timestamp already exists (``os.makedirs`` is called without
            ``exist_ok``).
        ValueError: If ``args.anneal_function`` is neither ``'identity'`` nor
            ``'linear'``; raised when the first batch loss is computed.
    """
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(data_dir=args.data_dir,
                              split=split,
                              create_data=args.create_data,
                              max_sequence_length=args.max_sequence_length,
                              min_occ=args.min_occ)

    model = SentenceVAE(vocab_size=datasets['train'].vocab_size,
                        sos_idx=datasets['train'].sos_idx,
                        eos_idx=datasets['train'].eos_idx,
                        pad_idx=datasets['train'].pad_idx,
                        unk_idx=datasets['train'].unk_idx,
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, experiment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    # Full batches per epoch times the number of epochs.  NOTE: the label
    # printed below says "dataset size", but the value is this step count.
    total_steps = (len(datasets["train"]) // args.batch_size) * args.epochs
    print("Train dataset size", total_steps)

    def kl_anneal_function(anneal_function, step):
        """Return the KL weight for optimisation step ``step``.

        ``'identity'`` always yields 1.  ``'linear'`` ramps from 0 to 1 over
        all training steps, or over ``args.warmup`` epochs' worth of steps
        when a warm-up is configured (and stays at 1.0 afterwards).

        Raises:
            ValueError: For any other schedule name, instead of silently
                returning ``None`` and failing later in the loss arithmetic.
        """
        if anneal_function == 'identity':
            return 1
        if anneal_function == 'linear':
            if args.warmup is None:
                return 1 - (total_steps - step) / total_steps
            warmup_steps = (total_steps / args.epochs) * args.warmup
            if step < warmup_steps:
                return 1 - (warmup_steps - step) / warmup_steps
            return 1.0
        raise ValueError(
            "Unknown anneal_function %r; expected 'identity' or 'linear'"
            % (anneal_function,))

    # NOTE(review): ``size_average=False`` (summed loss) is deprecated in
    # newer PyTorch in favour of ``reduction='sum'``; kept for compatibility.
    ReconLoss = torch.nn.NLLLoss(size_average=False,
                                 ignore_index=datasets['train'].pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step):
        """Return ``(recon_loss, KL_loss, KL_weight)`` for one batch."""

        # cut-off unnecessary padding from target, and flatten
        # (.item() instead of .data[0]: indexing a 0-dim tensor raises on
        # current PyTorch releases)
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        recon_loss = ReconLoss(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step)

        return recon_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    # Factory for the (initially empty) accumulator tensors in ``tracker``.
    tensor = torch.cuda.FloatTensor if torch.cuda.is_available(
    ) else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['length'])

                # loss calculation
                recon_loss, KL_loss, KL_weight = loss_fn(
                    logp, batch['target'], batch['length'], mean, logv,
                    args.anneal_function, step)

                if split == 'train':
                    loss = (recon_loss + KL_weight * KL_loss) / batch_size
                else:
                    # report complete elbo when validation
                    loss = (recon_loss + KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['negELBO'] = torch.cat(
                    (tracker['negELBO'], loss.data.unsqueeze(0)))

                # Global x-axis position shared by all per-batch scalars.
                global_step = epoch * len(data_loader) + iteration

                if args.tensorboard_logging:
                    neg_elbo = (recon_loss + KL_loss) / batch_size
                    writer.add_scalar("%s/Negative_ELBO" % split.upper(),
                                      neg_elbo.item(), global_step)
                    writer.add_scalar("%s/Recon_Loss" % split.upper(),
                                      recon_loss.item() / batch_size,
                                      global_step)
                    writer.add_scalar("%s/KL_Loss" % split.upper(),
                                      KL_loss.item() / batch_size,
                                      global_step)
                    writer.add_scalar("%s/KL_Weight" % split.upper(),
                                      KL_weight, global_step)

                if iteration % args.print_every == 0 or iteration + 1 == len(
                        data_loader):
                    logger.info(
                        "%s Batch %04d/%i, Loss %9.4f, Recon-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                        % (split.upper(), iteration, len(data_loader) - 1,
                           loss.item(), recon_loss.item() / batch_size,
                           KL_loss.item() / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(
                        batch['target'].data,
                        i2w=datasets['train'].get_i2w(),
                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            logger.info("%s Epoch %02d/%i, Mean Negative ELBO %9.4f" %
                        (split.upper(), epoch, args.epochs,
                         torch.mean(tracker['negELBO'])))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/NegELBO" % split.upper(),
                                  torch.mean(tracker['negELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {
                    'target_sents': tracker['target_sents'],
                    'z': tracker['z'].tolist()
                }
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(
                        os.path.join('dumps/' + ts +
                                     '/valid_E%i.json' % epoch),
                        'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path,
                                               "E%i.pytorch" % (epoch))
                torch.save(model.state_dict(), checkpoint_path)
                logger.info("Model saved at %s" % checkpoint_path)

    if args.num_samples:
        torch.cuda.empty_cache()
        model.eval()
        with torch.no_grad():
            print(f"Generating {args.num_samples} samples")
            generations, _ = model.inference(n=args.num_samples)
            vocab = datasets["train"].i2w

            print(
                "Sampled latent codes from z ~ N(0, I), generated sentences:")
            for i, generation in enumerate(generations, start=1):
                # i2w is keyed by the string form of the token id.
                sentence = [vocab[str(word.item())] for word in generation]
                print(f"{i}:", " ".join(sentence))
Esempio n. 10
0
def generate(date, epoch, sentiment, n_samples):
    """Sample sentences from a saved run and keep those with a given sentiment.

    The hyper-parameters of the run are looked up in
    ``Parameters/params.csv`` (row whose ``time`` equals ``date``), the
    vocabulary and checkpoint are read from ``bin/<date>/``, and
    ``model.inference(sentiment)`` is called ``n_samples`` times.  The first
    decoded sentence of every call is scored with VADER; the number of
    sentences scored positive is printed.

    Args:
        date: Run identifier; index into the ``time`` column of the
            parameter CSV and name of the run directory under ``bin/``.
        epoch: Epoch number of the checkpoint, i.e. ``E<epoch>.pytorch``.
        sentiment: Passed to ``model.inference`` and compared against the
            VADER label name (``'Positive'`` / ``'Negative'``).
        n_samples: Number of inference calls to make; ``0`` yields ``[]``.

    Returns:
        list: The generated sentences whose VADER label name equals
        ``sentiment``.

    Raises:
        FileNotFoundError: If the vocabulary file or the checkpoint is
            missing.
    """
    params = pd.read_csv("Parameters/params.csv")
    params = params.set_index('time')
    exp_descr = params.loc[date]

    embedding_size = exp_descr["embedding_size"]
    hidden_size = exp_descr["hidden_size"]
    rnn_type = exp_descr['rnn_type']
    latent_size = exp_descr["latent_size"]
    num_layers = 1
    bidirectional = bool(exp_descr["bidirectional"])
    max_sequence_length = exp_descr["max_sequence_length"]
    back = exp_descr["back"]
    attribute_size = exp_descr["attr_size"]

    # Yelp vocabulary; a PTB run would use '/ptb.vocab.json' here instead.
    vocab_dir = '/yelp_vocab.json'

    with open("bin/" + date + "/" + vocab_dir, 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    use_cuda = torch.cuda.is_available()
    # Hand the model a device that actually exists: 'cuda:0' as before when
    # CUDA is present, otherwise the CPU (the rest of this function already
    # supports CPU-only hosts).
    model_device = torch.device('cuda:0' if use_cuda else 'cpu')

    # Dropout is disabled (0 / 'static') because the model is only used for
    # generation here.
    model = SentenceVAE(vocab_size=len(w2i),
                        sos_idx=w2i['<sos>'],
                        eos_idx=w2i['<eos>'],
                        pad_idx=w2i['<pad>'],
                        unk_idx=w2i['<unk>'],
                        max_sequence_length=max_sequence_length,
                        embedding_size=embedding_size,
                        rnn_type=rnn_type,
                        hidden_size=hidden_size,
                        word_dropout=0,
                        embedding_dropout=0,
                        latent_size=latent_size,
                        num_layers=num_layers,
                        cuda=model_device,
                        bidirectional=bidirectional,
                        attribute_size=attribute_size,
                        word_dropout_type='static',
                        back=back)

    print(model)

    load_checkpoint = "bin/" + date + "/" + "E" + str(epoch) + ".pytorch"

    if not os.path.exists(load_checkpoint):
        raise FileNotFoundError(load_checkpoint)

    if use_cuda:
        model = model.cuda()
        device = "cuda"
    else:
        device = "cpu"

    model.load_state_dict(
        torch.load(load_checkpoint, map_location=torch.device(device)))

    model.eval()

    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyser = SentimentIntensityAnalyzer()

    def sentiment_analyzer_scores(sentence):
        """Map VADER's compound score to ``(1, 'Positive')`` or ``(0, 'Negative')``."""
        score = analyser.polarity_scores(sentence)
        if score['compound'] > 0.05:
            return 1, 'Positive'
        else:
            return 0, 'Negative'

    print('----------SAMPLES----------')
    labels = []
    generated = []
    # Pure generation: no gradients are needed.
    with torch.no_grad():
        for _ in range(n_samples):
            samples, _z, _l = model.inference(sentiment)
            sentence = idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>'])[0]
            # Score once and reuse both the numeric label and its name.
            label, label_name = sentiment_analyzer_scores(sentence)
            if label_name == sentiment:
                generated.append(sentence)
            labels.append(label)
    # Number of generated sentences that VADER scored as positive.
    print(sum(labels))
    return generated
    '''
Esempio n. 11
0
# Build the model from the vocabulary size (rows of ``weights``) and the
# special-token indices; no other constructor arguments are passed.
model = SentenceVAE(vocab_size=weights.size(0),
                    sos_idx=w2i['<sos>'],
                    eos_idx=w2i['<eos>'],
                    pad_idx=w2i['<pad>'])

# Deserialize onto the CPU: the model is still on the CPU at this point, and
# without map_location a checkpoint written from a GPU cannot be loaded on a
# host without CUDA.  The model is moved to the GPU right below if available.
model.load_state_dict(
    torch.load(args.load_checkpoint, map_location=torch.device('cpu')))
print("Model loaded from %s" % (args.load_checkpoint))

if torch.cuda.is_available():
    model = model.cuda()

model.eval()

# Draw five independent samples and print both the raw ids and their decoding.
print('----------SAMPLES----------')
for i in range(5):
    sample, z = model.inference()
    sample = sample.cpu().numpy()
    print(sample)
    # NOTE(review): ``sep`` has no effect with a single positional argument;
    # left as is because the return type of this idx2word is not visible here.
    print(idx2word(sample, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

# Load the PTB test split (sequence length fixed to 60 here).
datasets = OrderedDict()
datasets['test'] = PTB(data_dir=args.data_dir,
                       split='test',
                       create_data=args.create_data,
                       max_sequence_length=60,
                       min_occ=args.min_occ)

print('-------RECONSTRUCTION-------')

# Input token ids of the test example stored under key '300'.
sample = datasets['test'].data['300']['input']
print('sample 1: ' + idx2word(sample[1:], i2w=i2w, pad_idx=w2i['<pad>']),
Esempio n. 12
0
def main(args):
    """Encode one validation batch with a trained SentenceVAE and decode it.

    Loads the PTB vocabulary and the checkpoint named by ``args``, takes a
    single batch of two sentences from the ``valid`` split, prints the input
    sentences, runs the model on them and prints the sentences decoded from
    the resulting latent codes.

    Args:
        args: Parsed command-line namespace.  Reads ``data_dir``,
            ``load_checkpoint`` and the hyper-parameters forwarded to
            ``SentenceVAE`` below.

    Raises:
        FileNotFoundError: If the vocabulary file or ``args.load_checkpoint``
            does not exist.
    """

    with open(args.data_dir+'/ptb.vocab.json', 'r') as file:
        vocab = json.load(file)

    w2i, i2w = vocab['w2i'], vocab['i2w']

    model = SentenceVAE(
        vocab_size=len(w2i),
        sos_idx=w2i['<sos>'],
        eos_idx=w2i['<eos>'],
        pad_idx=w2i['<pad>'],
        unk_idx=w2i['<unk>'],
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if not os.path.exists(args.load_checkpoint):
        raise FileNotFoundError(args.load_checkpoint)

    # Deserialize onto the CPU: the model has not been moved to the GPU yet,
    # and without map_location a checkpoint written from a GPU cannot be
    # loaded on a host without CUDA.
    state_dict = torch.load(args.load_checkpoint,
                            map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    print("Model loaded from %s"%(args.load_checkpoint))

    if torch.cuda.is_available():
        model = model.cuda()

    model.eval()

    print('-------Encode ... Decode-------')

    datasets = PTB(
            data_dir=args.data_dir,
            split="valid",
            create_data=False,
            max_sequence_length=args.max_sequence_length,
            min_occ=1
        )
    # The original passed the truthy string 'valid' as ``shuffle``, which
    # DataLoader treats as "shuffle"; spelled as an explicit bool here.
    # NOTE(review): if a fixed first batch was intended, this should be False.
    data_loader = DataLoader(dataset=datasets,
                             batch_size=2,
                             shuffle=True,
                             num_workers=cpu_count(),
                             pin_memory=torch.cuda.is_available())

    for iteration, batch in enumerate(data_loader):
        for k, v in batch.items():
            if torch.is_tensor(v):
                batch[k] = to_var(v)

        print("*"*10)
        print(*idx2word(batch['input'], i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')
        logp, mean, logv, z = model(batch['input'], batch['length'])

        print("+"*10)
        samples, z = model.inference(z=z)
        print(*idx2word(samples, i2w=i2w, pad_idx=w2i['<pad>']), sep='\n')

        # Only the first batch is shown.
        if iteration == 0:
            break