Example #1
def main():
    # prepare corpus
    corpus = Corpus(args.data_file, args.dict_file, vocab_size=args.vocab_size)

    # dumping vocabulary
    with open(os.path.join(out_dir, 'vocab.json'), 'w') as f:
        json.dump(corpus.dictionary.word2idx, f)

    # save arguments
    ntokens = len(corpus.dictionary.word2idx)
    args.ntokens = ntokens
    with open(os.path.join(out_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    log.info('[Data Loaded.]')

    autoencoder = AutoEncoder()

    if args.split:
        train, valid = corpus.get_data(split=args.split)
        valid = batchify(valid, args.batch_size, shuffle=False)
    else:
        train = corpus.get_data()

    for epoch in range(1, args.epochs + 1):
        # shuffle train data in each epoch
        batches = batchify(train, args.batch_size, shuffle=True)

        global_iters = 0
        start_time = datetime.now()

        for i, batch in enumerate(batches):
            loss = autoencoder.update(batch)
            if i % args.log_interval == 0 and i > 0:
                log.info(('[Epoch {} {}/{} Loss {:.5f} ETA {}]').format(
                    epoch, i, len(batches), loss,
                    str((datetime.now() - start_time) / (i + 1) *
                        (len(batches) - i - 1)).split('.')[0]))

            global_iters += 1
            if global_iters % 100 == 0:
                autoencoder.anneal()

        if args.split:
            word_acc, sent_acc = autoencoder.evaluate(valid)
            msg = 'Epoch {} word acc: {} | sent acc: {}'.format(
                epoch, word_acc, sent_acc)
            log.warning(msg)
        autoencoder.save(out_dir, 'autoencoder_model_{}.pt'.format(epoch))
Example #2
def main():
    # load the saved autoencoder arguments, then rebuild the corpus and model
    with open(args.ae_args) as f:
        ae_args = json.load(f)
    corpus = Corpus(args.data_file,
                    args.dict_file,
                    vocab_size=ae_args['vocab_size'])
    autoencoder = Seq2Seq(emsize=ae_args['emsize'],
                          nhidden=ae_args['nhidden'],
                          ntokens=ae_args['ntokens'],
                          nlayers=ae_args['nlayers'],
                          hidden_init=ae_args['hidden_init'],
                          max_len=ae_args['max_len'],
                          gpu=args.cuda)
    autoencoder.load_state_dict(torch.load(args.model))
    if args.cuda:
        autoencoder.cuda()
    autoencoder.eval()

    if args.split:
        train, valid = corpus.get_data(split=args.split)
    else:
        valid = corpus.get_data()
    samples = batchify(random.sample(valid, args.len_samples),
                       args.batch_size,
                       shuffle=False)
    valid = batchify(valid, args.batch_size, shuffle=False)

    word_accuracies = []
    sent_accuracies = []

    f = open(args.err_f, 'w')
    for i, batch in enumerate(tqdm(valid, desc='acc')):
        source, target, length = batch
        source = to_gpu(args.cuda, Variable(source, volatile=True))
        target = to_gpu(args.cuda, Variable(target, volatile=True))
        length = to_gpu(args.cuda, Variable(length, volatile=True))

        # output: batch x seq_len x ntokens
        code = autoencoder.encode(source)
        max_indices = autoencoder.generate(code, length).contiguous()

        # ============word accuracy============
        word_accuracies.extend(  # strip the last <eos>
            max_indices.view(-1).eq(
                target[:, :-1].contiguous().view(-1)).data.cpu().tolist())

        # ==============generate examples==================
        max_indices = max_indices.data.cpu().numpy()
        target = target.data.cpu().numpy()

        for t, idx in zip(target, max_indices):
            # real sentence
            real = "".join([
                corpus.dictionary.idx2word[x] for x in t
                if x >= corpus.dictionary.offset
            ])
            # autoencoder output sentence
            gen = "".join([
                corpus.dictionary.idx2word[x] for x in idx
                if x >= corpus.dictionary.offset
            ])
            correct = real == gen
            sent_accuracies.append(correct)
            if not correct:
                f.write('{} | {}\n'.format(real, gen))
    f.close()

    log.info('word acc: {} sent acc: {}'.format(np.mean(word_accuracies),
                                                np.mean(sent_accuracies)))

    f = open(args.len_f, 'w')
    for i, batch in enumerate(tqdm(samples, desc='len')):
        source, target, length = batch
        source = to_gpu(args.cuda, Variable(source, volatile=True))
        target = to_gpu(args.cuda, Variable(target, volatile=True))

        target = target.view_as(source).data.cpu().numpy()

        one = torch.LongTensor([1]).expand_as(length)
        indices = []
        for j in range(-2, 3):
            length_ = torch.max(length + j, one)
            length_ = to_gpu(args.cuda, Variable(length_, volatile=True))
            code = autoencoder.encode(source)
            max_indices = autoencoder.generate(code, length_)
            indices.append(max_indices.data.cpu().numpy())

        for k, target_ in enumerate(target):
            # real sentence
            real = "".join([
                corpus.dictionary.idx2word[x] for x in target_
                if x >= corpus.dictionary.offset
            ])
            f.write('origin: {}\n'.format(real))
            for j in range(-2, 3):
                idx = indices[j + 2][k]  # indices was filled for j = -2..2 in order
                # autoencoder output sentence
                gen = "".join([
                    corpus.dictionary.idx2word[x] for x in idx
                    if x >= corpus.dictionary.offset
                ])
                f.write('{} {}\n'.format(j if j < 0 else '+' + str(j), gen))
    f.close()
Example #3
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# hyperparameters
embed_size = 128
hidden_size = 1024
num_layers = 2
num_epochs = 10
# number of words to be sampled
num_samples = 1000
batch_size = 20
seq_length = 30
learning_rate = 0.002

# load the training data and build the vocabulary
corpus = Corpus()
ids = corpus.get_data("./data/train.txt",
                      batch_size)  # word-id tensor of shape (batch_size, -1)
vocab_size = len(corpus.dictionary)  # vocabulary size
num_batches = ids.size(1)
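
# Note (not part of the original snippet): assuming ids has shape
# (batch_size, num_words), a language-model training loop typically strides
# over it in chunks of seq_length, feeding each chunk and its one-step-shifted
# copy to the model as inputs and targets, e.g.:
#
#     for i in range(0, ids.size(1) - seq_length, seq_length):
#         inputs = ids[:, i:i + seq_length].to(device)
#         targets = ids[:, i + 1:i + 1 + seq_length].to(device)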


class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        # initialize the embedding matrix
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
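
    def forward(self, x, hidden):
        # Sketch (not in the original snippet): a minimal forward pass,
        # assuming the conventional embed -> LSTM -> linear flow.
        # word ids (batch, seq_len) -> embeddings (batch, seq_len, embed_size)
        x = self.embed(x)
        # run the LSTM; hidden is an (h, c) tuple, each of shape
        # (num_layers, batch, hidden_size)
        out, hidden = self.lstm(x, hidden)
        # project every timestep to vocabulary logits
        out = self.linear(out.reshape(out.size(0) * out.size(1), out.size(2)))
        return out, hidden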