Ejemplo n.º 1
0
def build_vocab(args, tokenizer):
    """Build a sorted vocabulary from the first column of the training TSV.

    Args:
        args: Object exposing ``train_path``, the path of a tab-separated
            file readable by ``pd.read_csv``.
        tokenizer: Callable applied to the value returned by ``load_sent``
            for each row; it must return an iterable of hashable tokens.

    Returns:
        A ``(words, word2idx)`` tuple. ``words`` starts with the reserved
        tokens ``<pad>``, ``<unk>``, ``<bos>``, ``<eos>`` followed by the
        corpus tokens in sorted order; ``word2idx`` maps every entry of
        ``words`` to its position in that list.
    """
    specials = ['<pad>', '<unk>', '<bos>', '<eos>']
    vocab = collections.Counter()
    df = pd.read_csv(args.train_path, sep="\t")
    # Select the first column explicitly by position. Integer keys on a
    # label-indexed row (``row[0]``) rely on a positional fallback that
    # pandas has deprecated, and ``iterrows`` builds a Series per row.
    for cell in df.iloc[:, 0]:
        vocab.update(tokenizer(load_sent(cell, -1)))

    # A corpus token that collides with a reserved token must not be listed
    # twice, otherwise the mapping below would move the reserved token away
    # from its fixed leading index.
    reserved = set(specials)
    words = specials + sorted(w for w in vocab if w not in reserved)

    return (words, {w: i for i, w in enumerate(words)})
Ejemplo n.º 2
0
def main(args):
    """Train a text autoencoder and checkpoint the best model by valid loss.

    Reads training/validation sentences, builds the vocabulary, instantiates
    the model selected by ``args.model_type`` (``'dae'``, ``'vae'`` or
    ``'aae'``), optionally restores weights from ``args.load_model``, then
    trains for ``args.epochs`` epochs. After each epoch the model is
    evaluated on the validation batches and saved to
    ``<args.save_dir>/model.pt`` whenever the validation loss improves.
    Progress is appended to ``<args.save_dir>/log.txt`` through the
    project's ``logging`` helper.

    Args:
        args: Parsed options. Fields read here: ``save_dir``, ``train``,
            ``valid``, ``vocab_size``, ``seed``, ``no_cuda``, ``model_type``,
            ``load_model``, ``batch_size``, ``epochs``, ``log_interval``.
            The object is also passed to the model constructor and stored
            in the checkpoint.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.save_dir, exist_ok=True)
    log_file = os.path.join(args.save_dir, 'log.txt')
    logging(str(args), log_file)

    # Prepare data
    train_sents = load_sent(args.train)
    logging(
        '# train sents {}, tokens {}'.format(len(train_sents),
                                             sum(len(s) for s in train_sents)),
        log_file)
    valid_sents = load_sent(args.valid)
    logging(
        '# valid sents {}, tokens {}'.format(len(valid_sents),
                                             sum(len(s) for s in valid_sents)),
        log_file)
    vocab_file = os.path.join(args.save_dir, 'vocab.txt')

    # The vocabulary is rebuilt unconditionally on every run; an earlier
    # guard that skipped the build when the file already existed is disabled.
    # NOTE(review): when resuming via args.load_model the rebuilt vocabulary
    # presumably has to match the one used for the checkpoint -- confirm.
    Vocab.build(train_sents, vocab_file, args.vocab_size)

    vocab = Vocab(vocab_file)
    logging('# vocab size {}'.format(vocab.size), log_file)

    set_seed(args.seed)
    cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if cuda else 'cpu')
    model = {
        'dae': DAE,
        'vae': VAE,
        'aae': AAE
    }[args.model_type](vocab, args).to(device)
    if args.load_model:
        # map_location lets a checkpoint saved on GPU be restored on a
        # CPU-only host (and vice versa) instead of failing to deserialize.
        ckpt = torch.load(args.load_model, map_location=device)
        model.load_state_dict(ckpt['model'])
        model.flatten()
    logging(
        '# model parameters: {}'.format(
            sum(x.data.nelement() for x in model.parameters())), log_file)

    train_batches, _ = get_batches(train_sents, vocab, args.batch_size, device)
    valid_batches, _ = get_batches(valid_sents, vocab, args.batch_size, device)
    best_val_loss = None
    for epoch in range(args.epochs):
        start_time = time.time()
        logging('-' * 80, log_file)
        model.train()
        meters = collections.defaultdict(AverageMeter)
        # Visit the pre-built batches in a fresh random order each epoch.
        indices = list(range(len(train_batches)))
        random.shuffle(indices)
        for i, idx in enumerate(indices):
            inputs, targets = train_batches[idx]
            losses = model.autoenc(inputs, targets, is_train=True)
            losses['loss'] = model.loss(losses)
            model.step(losses)
            for k, v in losses.items():
                meters[k].update(v.item())

            if (i + 1) % args.log_interval == 0:
                log_output = '| epoch {:3d} | {:5d}/{:5d} batches |'.format(
                    epoch + 1, i + 1, len(indices))
                # Report the running averages, then reset them so the next
                # report covers only the following log interval.
                for k, meter in meters.items():
                    log_output += ' {} {:.2f},'.format(k, meter.avg)
                    meter.clear()
                logging(log_output, log_file)

        valid_meters = evaluate(model, valid_batches)
        logging('-' * 80, log_file)
        log_output = '| end of epoch {:3d} | time {:5.0f}s | valid'.format(
            epoch + 1,
            time.time() - start_time)
        for k, meter in valid_meters.items():
            log_output += ' {} {:.2f},'.format(k, meter.avg)
        val_loss = valid_meters['loss'].avg
        # Compare against None explicitly: a best loss of exactly 0.0 is a
        # valid value and must not be mistaken for "no best loss yet".
        if best_val_loss is None or val_loss < best_val_loss:
            log_output += ' | saving model'
            ckpt = {'args': args, 'model': model.state_dict()}
            torch.save(ckpt, os.path.join(args.save_dir, 'model.pt'))
            best_val_loss = val_loss
        logging(log_output, log_file)
    logging('Done training', log_file)
Ejemplo n.º 3
0
def main(args):
    """Run evaluation, sampling and/or blank filling with a trained model.

    The model is loaded from ``args.checkpoint`` and its vocabulary from
    ``vocab.txt`` under ``model.hparams.root_dir``. Three independent
    stages then run depending on which options are set:

    * ``args.eval``: load the data and run ``trainer.test`` on it.
    * ``args.sample``: generate that many samples and write them to the
      output file.
    * ``args.fill``: load sentences, generate fillings and write them to
      ``<output>.fill`` and ``<output>.full``.

    The output path is ``outputs/<args.output>`` under the directory two
    levels above ``args.checkpoint``.

    Args:
        args: Parsed options. Fields read here: ``seed``, ``checkpoint``,
            ``eval``, ``max_tok``, ``data_workers``, ``gpus``,
            ``fp16_opt_level``, ``fp16``, ``n_mc``, ``output``, ``sample``,
            ``decode``, ``write_mid``, ``fill``, ``anywhere``,
            ``force_insert``, ``prioritize_unfilled``.

    Raises:
        ValueError: If ``args.sample`` or ``args.fill`` is set but
            ``args.output`` is not, since there is nowhere to write to.
    """
    pl.seed_everything(args.seed)

    # NOTE: ``device`` is not defined in this function; it is expected to be
    # provided by the enclosing module scope.
    model = load_model(args.checkpoint).to(device)
    model.eval()
    vocab = Vocab(os.path.join(model.hparams.root_dir, 'vocab.txt'))

    if args.eval:
        data = load_data(args.eval, model.hparams.add_eos,
                         model.hparams.cat_sent, model.hparams.max_len)
        dl = get_eval_dataloader(data,
                                 vocab,
                                 args.max_tok,
                                 data_workers=args.data_workers,
                                 model_type=model.hparams.model_type)
        trainer = pl.Trainer(gpus=args.gpus,
                             amp_level=args.fp16_opt_level,
                             precision=16 if args.fp16 else 32,
                             default_root_dir='testing_logs')
        model.hparams.n_mc = args.n_mc
        trainer.test(model, test_dataloaders=dl)

    output = None
    if args.output:
        output = os.path.join(
            os.path.dirname(os.path.dirname(args.checkpoint)), 'outputs/',
            args.output)
        makedir(output)
    elif args.sample or args.fill:
        # Without this check the stages below would fail with an opaque
        # UnboundLocalError when they try to open the output file.
        raise ValueError(
            'args.output must be set when args.sample or args.fill is used')

    if args.sample:
        with open(output, 'w') as f:
            for _ in tqdm(range(args.sample)):
                if model.hparams.model_type == 'inst':
                    # Start from an empty sequence with a single blank
                    # position at index 0.
                    _, full = model.generate([], [0], args.decode, device)
                else:
                    _, full = model.generate([model.init_canvas()],
                                             args.decode, device)

                full = [[vocab.idx2word[idx] for idx in ids] for ids in full]
                write(f, full, args.write_mid)

    if args.fill:
        sents = load_sent(args.fill, model.hparams.add_eos)
        sents = [[vocab.word_to_idx(w) for w in s] for s in sents]
        with open(output + '.fill', 'w') as f_fill, \
                open(output + '.full', 'w') as f_full:
            for s in tqdm(sents):
                if model.hparams.model_type == 'inst':
                    # Strip blank tokens from the sentence and record, for
                    # each one, the position in the stripped sequence where
                    # it occurred.
                    seq, blanks = [], []
                    for w in s:
                        if w == vocab.blank:
                            blanks.append(len(seq))
                        else:
                            seq.append(w)
                    if args.anywhere:
                        # Allow insertion at every position, including
                        # both ends of the sequence.
                        blanks = list(range(len(seq) + 1))
                    fill, full = model.generate(seq, blanks, args.decode,
                                                device, args.force_insert,
                                                args.prioritize_unfilled)
                else:
                    fill, full = model.generate(s, args.decode, device)

                fill = [[vocab.idx2word[idx] for idx in ids] for ids in fill]
                full = [[vocab.idx2word[idx] for idx in ids] for ids in full]
                write(f_fill, fill, args.write_mid)
                write(f_full, full, args.write_mid)