Exemple #1
0
def main(args):
    """Serialize a vocabulary-only copy of the corpus to ``args.out_file``.

    The corpus is built from ``args.corpus_path``; its ``train``, ``valid``
    and ``test`` attributes are cleared before saving so that only the
    remaining state (the vocabulary) ends up in the file.

    Raises:
        ValueError: if the parent directory of ``args.out_file`` is missing.
    """
    target_dir = Path(args.out_file).parent
    if not target_dir.exists():
        message = "Invalid out_file %s. Does the directory exist?" % (
            args.out_file, )
        raise ValueError(message)

    vocab_only = Corpus(args.corpus_path)

    # Drop the actual dataset splits -- only the vocabulary is kept.
    for split_name in ("train", "valid", "test"):
        setattr(vocab_only, split_name, None)

    torch.save(vocab_only, args.out_file)
Exemple #2
0
 def train_model(self):
     """Train an ``RNNModel`` on the corpus at ``self.args.file`` and return it.

     The model is checkpointed to ``args.save`` after every epoch, and the
     learning rate handed to ``train.train`` is divided by four after each
     epoch. A ``KeyboardInterrupt`` stops training early; the model trained
     so far is still returned.
     """
     cfg = self.args
     device = self.device
     separator = '-' * 89

     # Data: the training split, batched on the target device.
     corpus = Corpus(cfg.file)
     batches = train.batchify(corpus.train, cfg.batch_size, device)

     # Model, loss and optimizer.
     vocab_size = len(corpus.dictionary)
     model = RNNModel(cfg.model, vocab_size, cfg.emsize, cfg.nhid,
                      cfg.nlayers, cfg.dropout, cfg.tied)
     model = model.to(device)
     criterion = cfg.criterion
     optimizer = optim.Adam(model.parameters(), lr=cfg.lr)

     # Epoch loop; Ctrl + C breaks out of training early.
     current_lr = cfg.lr
     try:
         for epoch in range(1, cfg.epochs + 1):
             epoch_started_at = time.time()
             train.train(batches, cfg, model, optimizer, criterion, corpus,
                         epoch, current_lr, device)
             print(separator)
             with open(cfg.save, 'wb') as checkpoint_file:
                 torch.save(model, checkpoint_file)
             current_lr = current_lr / 4.0
     except KeyboardInterrupt:
         print(separator)
         print('Exiting from training early')
     return model
Exemple #3
0
def main():
    """
    Load the saved module (prefix ``./output/model``, epoch 39) and print
    its outputs for every batch of the test iterator.
    """
    ctx = mx.gpu(2)
    batch_size, bptt = 40, 35

    corpus = Corpus('./data/ptb.')
    ntokens = len(corpus.dictionary)
    train_data, valid_data, test_data = [
        CorpusIter(split, batch_size, bptt)
        for split in (corpus.train, corpus.valid, corpus.test)
    ]

    # Input/label names are taken from the test iterator's descriptors.
    data_names = [desc[0] for desc in test_data.provide_data]
    label_names = [desc[0] for desc in test_data.provide_label]

    prefix, epoch = './output/model', 39
    model = mx.module.Module.load(prefix,
                                  epoch,
                                  label_names=label_names,
                                  data_names=data_names,
                                  context=ctx)
    model.bind(for_training=False,
               data_shapes=test_data.provide_data,
               label_shapes=test_data.provide_label)

    for batch in test_data:
        model.forward(batch, is_train=False)
        print(model.get_outputs())
Exemple #4
0
def get_popular_first_words(args):
    """Return the most frequent capitalised training tokens, most common first.

    The corpus is loaded from ``args.data_path``. Token ids of the training
    split are ranked by frequency, ids whose word does not start with an
    upper-case character are dropped, and the first
    ``args.utterances_to_generate`` survivors are returned as words.

    Returns:
        list: the selected words, in descending order of frequency.
    """
    corpus = Corpus(args.data_path)
    idx2word = corpus.dictionary.idx2word
    ranked_ids = [idx for idx, _ in Counter(corpus.train.tolist()).most_common()]
    # ``[:1]`` instead of ``[0]`` so an empty token is simply skipped rather
    # than raising IndexError ("".isupper() is False).
    capitalised_ids = [idx for idx in ranked_ids
                       if idx2word[idx][:1].isupper()]
    return [idx2word[idx]
            for idx in capitalised_ids[:args.utterances_to_generate]]
Exemple #5
0
def plot(args):
    """Plot the embeddings of the most common training words.

    Loads the model stored at ``args.checkpoint``, rebuilds the corpus from
    ``args.data_dir``, selects the embeddings of the 1000 most frequent
    entries of ``corpus.train`` and passes them to ``emb_scatter``.
    """
    num_words = 1000

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)

    # Load model.
    with open(args.checkpoint, 'rb') as f:
        try:
            model = torch.load(f)
        except Exception:
            # Convert the model to CPU if the model is serialized on GPU.
            # The failed attempt has already consumed part of the stream, so
            # rewind before retrying.
            f.seek(0)
            model = torch.load(f, map_location='cpu')
    model.eval()
    embeddings = model.embedding.weight.data

    # Load data.
    data_dir = os.path.expanduser(args.data_dir)
    corpus = Corpus(data_dir, headers=args.no_headers, lower=args.lower, chars=args.use_chars)

    # Some checks to see if data and model are consistent.
    model_data_checks(model, corpus, args)

    # Prepare embeddings from num_words most common words.
    most_common_idxs = Counter(corpus.train).most_common(num_words)
    most_common_idxs, _ = zip(*most_common_idxs)  # Discard counts
    most_common_words = [corpus.dictionary.i2w[i] for i in most_common_idxs]
    idxs = torch.LongTensor(most_common_idxs)
    # ``.cpu()`` is a no-op for CPU tensors and makes ``.numpy()`` valid when
    # the checkpoint was loaded onto a GPU.
    embeddings = embeddings[idxs, :].cpu().numpy()

    # Make bokeh plot.
    emb_scatter(embeddings, most_common_words, model_name=args.name)
Exemple #6
0
def save_lmbmet_ds():
    """Instantiate a Corpus object, add vocabulary and encoded data.

    Reads the module-level ``words``, ``mesh_def`` and ``docs`` and pickles
    the resulting corpus (protocol 4) to the module-level ``lm_out_file``.
    """
    logger.info('Saving LMBMET training data...')

    corpus = Corpus()

    # Identify all the tokens except MeSH descriptors, which are included in
    # vocab via mesh_def
    words_wo_mesh_d = [
        w for w, _ in words.most_common()
        if not (w in mesh_def and mesh_def[w]['descriptor'])
    ]
    corpus.vocab.load_vocab(words_wo_mesh_d,
                            mesh_def=mesh_def,
                            specials=['<eos>', '<unk>'])
    corpus.mesh_def = mesh_def

    corpus.load_data(docs)
    # Use a context manager so the output file is flushed and closed even if
    # pickling fails part-way.
    with lm_out_file.open("wb") as out_f:
        pickle.dump(corpus, out_f, protocol=4)
Exemple #7
0
def main(args):
    """Decode the corpus with a saved model and score it with ``eval.pl``.

    The predicted CoNLL output is written to ``args.predict_path``; the
    output of ``perl eval.pl -g <gold> -s <predicted>`` is written to
    ``args.result_path``.

    Raises:
        OSError: if the ``perl`` executable cannot be started.
    """
    import subprocess

    data_dir = os.path.expanduser(args.data_dir)
    gold_path = os.path.expanduser(args.gold_path)

    corpus = Corpus(args.vocab_path, data_dir)
    model = torch.load(args.model_path)

    parser = Decoder(corpus, model)
    conll = parser.batch_eval()

    # Write the conll as text.
    conll.write(args.predict_path)
    # Evaluate the predicted conll.
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact. ``~`` is expanded here because the shell no
    # longer does it for us.
    command = ['perl', 'eval.pl',
               '-g', gold_path,
               '-s', os.path.expanduser(args.predict_path)]
    with open(os.path.expanduser(args.result_path), 'wb') as result_file:
        subprocess.call(command, stdout=result_file)
Exemple #8
0
def test(data_path, model_path, options_path, dict_path):
    """Evaluate a saved model on the test split and print its perplexity.

    NOTE: this is Python 2 source (it uses ``print`` statements).

    Args:
        data_path: passed to ``Corpus`` to load the data.
        model_path: file opened in binary mode and loaded with ``torch.load``.
        options_path: pickle file holding the model parameters; it must
            provide at least the ``'batch_size'`` key.
        dict_path: not used inside this function.
    """

    # Restore the serialized model.
    with open(model_path, 'rb') as f:
        model = torch.load(f)

    # Restore the options the model was trained with.
    with open(options_path, 'rb') as f:
        model_params = pkl.load(f)

    print "Load data..."
    corpus = Corpus(data_path)
    # Only the test split is batched, using the stored batch size.
    test_data = batchify(corpus.test, model_params['batch_size'])
    print "Done"

    PPL = evaluatePTB(test_data, model, model_params, corpus.dictionary)
    print 'test perplexity: ', PPL
Exemple #9
0
def main():
    """Train an LSTM language-model learner, report test metrics and show
    sample predictions and the most frequent mistakes."""
    set_random_seed(2020)
    show_device_info()

    corpus = Corpus(Path('/media/bnu/data/nlp-practice/language-model'),
                    sort_by_len=False)

    learner_options = dict(n_embed=400,
                           n_hidden=400,
                           dropout=0.5,
                           rnn_type='LSTM',
                           batch_size=128,
                           learning_rate=1e-3)
    learner = LMLearner(corpus, **learner_options)
    # Train the model; this call can be commented out when the model is
    # already trained and only the error analysis below is needed.
    run(learner)

    test_loss, test_acc, test_words, test_result = learner.predict()
    summary = 'Result in Test --> Loss: {:.3f}, Acc: {:.3f}, Words: {}'.format(
        test_loss, test_acc, test_words)
    print(summary)

    show_test_sample(4, test_result, corpus)
    show_most_mistake(test_result, corpus)
Exemple #10
0
def main(args):
    """Fit an n-gram model, generate sample text and report test perplexity.

    The generated text is written to ``generated.<name>.txt`` and, for a
    smoothed model, the test NLL/perplexity to ``result.<name>.txt``, both
    under ``args.out``. An unsmoothed model exits before evaluation.
    """
    print(f'Loading corpus from `{args.data}`...')
    corpus = Corpus(args.data,
                    order=args.order,
                    lower=args.lower,
                    max_lines=args.max_lines)
    model = Ngram(order=args.order)
    name = f'{args.name}.{args.order}gram'

    print('Example data:')
    print('Train:', corpus.train[:20])
    print('Valid:', corpus.valid[:20])

    print('Training model...')
    smoothing = dict(add_k=args.add_k,
                     interpolate=args.interpolate,
                     backoff=args.backoff)
    model.train(corpus.train, **smoothing)
    print(f'Vocab size: {len(model.vocab):,}')

    if args.save_arpa:
        print(f'Saving model to `{name}`...')
        model.save_arpa(name)

    assert model.sum_to_one(n=10)

    print('Generating text...')
    text = ' '.join(model.generate(100))
    path = os.path.join(args.out, f'generated.{name}.txt')
    print(text)
    with open(path, 'w') as f:
        print(text, file=f)

    # Guard clause: an unsmoothed model is not evaluated at all.
    if not model.is_smoothed:
        exit(
            'No evaluation with unsmoothed model: probability is probably 0 anyways.'
        )

    print('\nPredicting test set NLL...')
    logprob = model(corpus.test)
    nll = -logprob / len(corpus.test)
    report = f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}'
    print(report)
    path = os.path.join(args.out, f'result.{name}.txt')
    with open(path, 'w') as f:
        print(report, file=f)
Exemple #11
0
def main(data_dir):
    """Count 1- to 4-grams of the training split and dump each table to
    ``data/wikitext.<n>gram.json``."""
    print(f'Reading and processing data from `{data_dir}`...')
    corpus = Corpus(data_dir)

    print(f'Collecting ngram counts...')
    tables = []
    print('Unigram...')
    tables.append(get_unigrams(corpus.train))
    # Higher orders differ only in the length of the conditioning history.
    for label, history in (('Bigram...', 1), ('Trigram...', 2),
                           ('Fourgram...', 3)):
        print(label)
        tables.append(get_ngrams(corpus.train, history=history))

    i = 0
    for gram in tables:
        i += 1
        with open(f'data/wikitext.{i}gram.json', 'w') as f:
            json.dump(gram, f, indent=4)

    print('Done.')
Exemple #12
0
def main(unused_argv):
    """Set up the model directory and train a PolyLM model.

    Creates ``options.model_dir`` if needed, publishes a timestamped copy of
    the sources into it, records the flags in a ``flags`` file, then restores
    any existing checkpoint and trains on the corpus.

    Raises:
        Exception: if ``unused_argv`` does not contain exactly one element.
    """
    if len(unused_argv) != 1:
        raise Exception("There is a problem with how you entered flags: %s" % unused_argv)

    options, vocab, multisense_vocab, tf_config = init.init()
    model = polylm.PolyLM(vocab,
                          options,
                          multisense_vocab=multisense_vocab,
                          training=True)
    test_words = options.test_words.split()

    model_dir = options.model_dir
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Snapshot of the sources, in a directory named after the current time.
    snapshot_dir = os.path.join(model_dir, 'src_%d' % int(time.time()))
    os.makedirs(snapshot_dir)
    publish_source(snapshot_dir)

    flags_text = options.flags_into_string()
    flags_path = os.path.join(model_dir, 'flags')
    with open(flags_path, 'w') as flags_file:
        flags_file.write(flags_text)

    corpus = Corpus(options.corpus_path, vocab)
    with tf.Session(config=tf_config) as sess:
        model.attempt_restore(sess, model_dir, False)
        model.train(corpus, sess, test_words=test_words)
Exemple #13
0
    return loss


if __name__ == '__main__':
    # args
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    args = parser.parse_args()
    logging.info(args)
    ctx = mx.gpu()
    batch_size = args.batch_size
    bptt = args.bptt
    mx.random.seed(args.seed)

    # data
    corpus = Corpus(args.data)
    ntokens = len(corpus.dictionary)
    train_data = CorpusIter(corpus.train, batch_size, bptt)
    valid_data = CorpusIter(corpus.valid, batch_size, bptt)
    test_data = CorpusIter(corpus.test, batch_size, bptt)

    # model
    pred, states, state_names = rnn(bptt, ntokens, args.emsize, args.nhid,
                                    args.nlayers, args.dropout, batch_size,
                                    args.tied)
    loss = softmax_ce_loss(pred)

    # module
    module = CustomStatefulModule(loss,
                                  states,
                                  state_names=state_names,
Exemple #14
0
def main():
    """Train, test and optionally refine a multilingual RNN language model.

    Driven entirely by the arguments returned from ``get_args()``:

    1. Build train/valid/test datasets (plus per-language ``train_100``
       refinement sets when ``args.refine`` is set).
    2. Build the model, loss and optimizer (optionally with Nvidia Apex).
    3. Optionally resume from ``args.checkpoint``.
    4. With ``args.train``: train with per-epoch checkpoints, early stopping
       and step-wise LR decay, then run the test set. With ``args.test``:
       only run the test set.
    5. If there are target languages: set up the prior selected by
       ``args.prior`` and, with ``args.refine``, refine the best model on
       each target language and log the per-language results.

    Relies on the module-level names ``timestamp`` and ``tb_writer``, which
    are read here but not defined in this function.

    Raises:
        NotImplementedError: if ``args.prior == 'hmc'``.
        ValueError: if ``args.prior`` is not a known inference technique.
    """
    args = get_args()

    log.info(f'Parsed arguments: \n{pformat(args.__dict__)}')
    assert args.cond_type.lower() in ['none', 'platanios', 'oestling']

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    log.info('Using device {}.'.format(device))

    use_apex = False
    if torch.cuda.is_available() and args.fp16:
        log.info('Loading Nvidia Apex and using AMP')
        from apex import amp, optimizers
        use_apex = True
    else:
        log.info('Using FP32')
        amp = None

    log.info(f'Using time stamp {timestamp} to save models and logs.')

    if not args.no_seed:
        log.info(f'Setting random seed to {args.seed} for reproducibility.')
        torch.manual_seed(args.seed)
        random.seed(args.seed)

    data = Corpus(args.datadir)

    data_splits = [
        {
            'split': 'train',
            'languages': args.dev_langs + args.target_langs,
            'invert_include': True,
        },
        {
            'split': 'valid',
            'languages': args.dev_langs,
        },
        {
            'split': 'test',
            'languages': args.target_langs,
        },
    ]

    if args.refine:
        data_splits.append({
            'split': 'train_100',
            'languages': args.target_langs,
            'ignore_missing': True,
        })

    data_splits = data.make_datasets(data_splits, force_rebuild=args.rebuild)
    train_set, val_set, test_set = data_splits['train'], data_splits[
        'valid'], data_splits['test']
    dictionary = data_splits['dictionary']

    train_language_distr = get_sampling_probabilities(train_set, 1.0)
    train_set = Dataset(train_set,
                        batchsize=args.batchsize,
                        bptt=args.bptt,
                        reset_on_iter=True,
                        language_probabilities=train_language_distr)
    val_set = Dataset(val_set,
                      make_config=True,
                      batchsize=args.valid_batchsize,
                      bptt=args.bptt,
                      eval=True)
    test_set = Dataset(test_set,
                       make_config=True,
                       batchsize=args.test_batchsize,
                       bptt=args.bptt,
                       eval=True)

    train_loader = DataLoader(train_set, num_workers=args.workers)
    val_loader = DataLoader(val_set, num_workers=args.workers)
    test_loader = DataLoader(test_set, num_workers=args.workers)

    if args.refine:
        # One single-language dataset per refinement language.
        refine_set = dict()
        for lang, lang_d in data_splits['train_100'].items():
            refine_set[lang] = Dataset({lang: lang_d},
                                       batchsize=args.valid_batchsize,
                                       bptt=args.bptt,
                                       make_config=True)

    n_token = len(dictionary.idx2tkn)

    # Load and preprocess matrix of typological features
    # TODO: implement this, the OEST
    # prior_matrix = load_prior(args.prior, corpus.dictionary.lang2idx)
    # n_components = min(50, *prior_matrix.shape)
    # pca = PCA(n_components=n_components, whiten=True)
    # prior_matrix = pca.fit_transform(prior_matrix)
    prior = None

    model = RNN(args.cond_type,
                prior,
                n_token,
                n_input=args.emsize,
                n_hidden=args.nhidden,
                n_layers=args.nlayers,
                dropout=args.dropouto,
                dropoute=args.dropoute,
                dropouth=args.dropouth,
                dropouti=args.dropouti,
                wdrop=args.wdrop,
                wdrop_layers=[0, 1, 2],
                tie_weights=True).to(device)

    if args.opt_level != 'O2':
        loss_function = SplitCrossEntropyLoss(args.emsize,
                                              splits=[]).to(device)
    else:
        loss_function = CrossEntropyLoss().to(
            device)  # Should be ok to use with a vocabulary of this small size

    if use_apex:
        optimizer = optimizers.FusedAdam(model.parameters(),
                                         lr=args.lr,
                                         weight_decay=args.wdecay)
    else:
        params = list(filter(lambda p: p.requires_grad,
                             model.parameters())) + list(
                                 loss_function.parameters())
        optimizer = Adam(params, lr=args.lr, weight_decay=args.wdecay)

    if use_apex:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level)

    # Shared keyword bundle passed to train / evaluate / refine /
    # load_model / make_checkpoint.
    parameters = {
        'model': model,
        'optimizer': optimizer,
        'loss_function': loss_function,
        'use_apex': use_apex,
        'amp': amp if use_apex else None,
        'clip': args.clip,
        'alpha': args.alpha,
        'beta': args.beta,
        'bptt': args.bptt,
        'device': device,
        'prior': args.prior,
    }

    # Add backward hook for gradient clipping
    if args.clip:
        if use_apex:
            for p in amp.master_params(optimizer):
                p.register_hook(
                    lambda grad: torch.clamp(grad, -args.clip, args.clip))
        else:
            for p in model.parameters():
                p.register_hook(
                    lambda grad: torch.clamp(grad, -args.clip, args.clip))

    if args.prior == 'vi':
        prior = VIPrior(model, device=device)
        parameters['prior'] = prior

        def sample_weights(module: torch.nn.Module, input: torch.Tensor):
            prior.sample_weights(module)

        sample_weights_hook = model.register_forward_pre_hook(sample_weights)

    # Load model checkpoint if available
    start_epoch = 1
    if args.resume:
        if args.checkpoint is None:
            log.error(
                'No checkpoint passed. Specify it using the --checkpoint flag')
            checkpoint = None
        else:
            log.info('Loading the checkpoint at {}'.format(args.checkpoint))
            checkpoint = load_model(args.checkpoint, **parameters)

            start_epoch = checkpoint['epoch']

        if args.wdrop:
            for rnn in model.rnns:
                if isinstance(rnn, WeightDrop):
                    rnn.dropout = args.wdrop
                elif rnn.zoneout > 0:
                    rnn.zoneout = args.wdrop

    saved_models = []

    result_str = '| Language {} | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'

    def test():
        """Evaluate on the test loader and log overall and per-language loss."""
        log.info('=' * 89)
        log.info('Running test set (zero-shot results)...')
        test_loss, avg_loss = evaluate(test_loader, **parameters)
        log.info('Test set finished | test loss {} | test bpc {}'.format(
            test_loss, test_loss / math.log(2)))

        for lang, avg_l_loss in avg_loss.items():
            langstr = dictionary.idx2lang[lang]
            log.info(
                result_str.format(langstr, avg_l_loss, math.exp(avg_l_loss),
                                  avg_l_loss / math.log(2)))

        log.info('=' * 89)

    if args.train:
        f = 1.
        stored_loss = 1e32
        epochs_no_improve = 0

        val_losses = []

        # calculate specific language lr
        data_spec_count = sum(len(ds) for ds in train_set.data.values())
        data_spec_avg = data_spec_count / len(train_set.data.items())
        data_spec_lrweights = {
            l: data_spec_avg / len(ds)
            for l, ds in train_set.data.items()
        }

        # estimate total number of steps
        total_steps = sum(
            len(ds) // args.bptt
            for ds in train_set.data.values()) * args.no_epochs
        steps = 0

        # Bind ``epoch`` up front so the KeyboardInterrupt handler below can
        # still name the checkpoint if the interrupt arrives before the first
        # loop iteration.
        epoch = start_epoch

        try:
            pbar = tqdm.trange(start_epoch,
                               args.no_epochs + 1,
                               position=1,
                               dynamic_ncols=True)
            for epoch in pbar:

                steps = train(train_loader,
                              lr_weights=data_spec_lrweights,
                              **parameters,
                              total_steps=total_steps,
                              steps=steps,
                              scaling=args.scaling,
                              n_samples=args.n_samples,
                              tb_writer=tb_writer)

                val_loss, _ = evaluate(val_loader, **parameters)
                pbar.set_description('Epoch {} | Val loss {}'.format(
                    epoch, val_loss))

                # Save model (the sampling hook is detached while the
                # checkpoint is written and re-attached afterwards).
                if args.prior == 'vi':
                    sample_weights_hook.remove()

                filename = path.join(
                    args.checkpoint_dir, '{}_epoch{}{}_{}.pth'.format(
                        timestamp, epoch, '_with_apex' if use_apex else '',
                        args.prior))
                torch.save(make_checkpoint(epoch + 1, **parameters), filename)
                saved_models.append(filename)

                if args.prior == 'vi':
                    sample_weights_hook = model.register_forward_pre_hook(
                        sample_weights)

                # Early stopping
                if val_loss < stored_loss:
                    epochs_no_improve = 0
                    stored_loss = val_loss
                else:
                    epochs_no_improve += 1

                if epochs_no_improve == args.patience:
                    log.info('Early stopping at epoch {}'.format(epoch))
                    break

                val_losses.append(val_loss)

                # Reduce lr every 1/3 total epochs
                if epoch - 1 > f / 3 * args.no_epochs:
                    log.info('Epoch {}/{}. Dividing LR by 10'.format(
                        epoch, args.no_epochs))
                    for g in optimizer.param_groups:
                        g['lr'] = g['lr'] / 10

                    f += 1.
            test()
        except KeyboardInterrupt:
            log.info('Registered KeyboardInterrupt. Stopping training.')
            log.info('Saving last model to disk')

            if args.prior == 'vi':
                sample_weights_hook.remove()

            torch.save(
                make_checkpoint(epoch, **parameters),
                path.join(
                    args.checkpoint_dir, '{}_epoch{}{}_{}.pth'.format(
                        timestamp, epoch, '_with_apex' if use_apex else '',
                        args.prior)))
            return
    elif args.test:
        test()

    # Only test on existing languages if there are no held out languages
    if not args.target_langs:
        exit(0)

    importance = 1e-5

    # If use UNIV, calculate informed prior, else use boring prior
    if args.prior == 'laplace':
        if not isinstance(
                prior,
                LaplacePrior):  # only calculate matrix if it is not supplied.
            log.info('Creating laplace approximation dataset')
            laplace_set = Dataset(data_splits['train'],
                                  batchsize=args.batchsize,
                                  bptt=100,
                                  reset_on_iter=True)
            laplace_loader = DataLoader(laplace_set, num_workers=args.workers)
            log.info('Creating Laplacian prior')
            prior = LaplacePrior(model,
                                 loss_function,
                                 laplace_loader,
                                 use_apex=use_apex,
                                 amp=amp,
                                 device=device)
            parameters['prior'] = prior

            torch.save(
                make_checkpoint('fisher_matrix', **parameters),
                path.join(
                    args.checkpoint_dir, '{}_fishers_matrix{}_{}.pth'.format(
                        timestamp, '_with_apex' if use_apex else '',
                        args.prior)))
        importance = 1e5

    elif args.prior == 'ninf':
        log.info('Creating non-informative Gaussian prior')
        parameters['prior'] = GaussianPrior()
    elif args.prior == 'vi':
        importance = 1e-5
    elif args.prior == 'hmc':
        raise NotImplementedError
    else:
        raise ValueError(
            f'Passed prior {args.prior} is not an implemented inference technique.'
        )

    # Latest checkpoint written by this run, else the one given on the CLI.
    best_model = saved_models[-1] if saved_models else args.checkpoint

    # Remove sampling hook from model
    if args.prior == 'vi':
        sample_weights_hook.remove()

    # Refine on 100 samples on each target
    if args.refine:
        # reset learning rate
        optimizer.param_groups[0]['lr'] = args.lr
        loss = 0

        results = dict()

        # Create individual tests sets
        test_sets = dict()
        for lang, lang_d in data_splits['test'].items():
            test_sets[lang] = DataLoader(Dataset({lang: lang_d},
                                                 make_config=True,
                                                 batchsize=args.test_batchsize,
                                                 bptt=args.bptt,
                                                 eval=True),
                                         num_workers=args.workers)

        for lang, lang_data in tqdm.tqdm(refine_set.items()):
            final_loss = False
            refine_dataloader = DataLoader(lang_data, num_workers=args.workers)
            # Every language starts refinement from the same checkpoint.
            load_model(best_model, **parameters)

            log.info(f'Refining for language {dictionary.idx2lang[lang]}')
            for epoch in range(1, args.refine_epochs + 1):
                refine(refine_dataloader, **parameters, importance=importance)
                if epoch % 5 == 0:
                    # NOTE(review): this flag is set by any evaluation, not
                    # only one at the last epoch -- confirm that is intended
                    # when refine_epochs is not a multiple of 5.
                    final_loss = True
                    loss, avg_loss = evaluate(test_sets[lang],
                                              model,
                                              loss_function,
                                              only_l=lang,
                                              report_all=True,
                                              device=device)

                    # A separate loop name keeps the outer ``lang`` intact;
                    # it is still needed for later evaluations of this
                    # language.
                    for eval_lang, avg_l_loss in avg_loss.items():
                        langstr = dictionary.idx2lang[eval_lang]
                        log.debug(
                            result_str.format(langstr, avg_l_loss,
                                              math.exp(avg_l_loss),
                                              avg_l_loss / math.log(2)))

            if not final_loss:
                loss, avg_loss = evaluate(test_sets[lang],
                                          model,
                                          loss_function,
                                          only_l=lang,
                                          report_all=True,
                                          device=device)

            for eval_lang, avg_l_loss in avg_loss.items():
                langstr = dictionary.idx2lang[eval_lang]
                log.info(
                    result_str.format(langstr, avg_l_loss,
                                      math.exp(avg_l_loss),
                                      avg_l_loss / math.log(2)))
                results[eval_lang] = avg_l_loss

        log.info('=' * 89)
        log.info('FINAL FEW SHOT RESULTS: ')
        log.info('=' * 89)
        for res_lang, avg_l_loss in results.items():
            langstr = dictionary.idx2lang[res_lang]
            log.info(
                result_str.format(langstr, avg_l_loss, math.exp(avg_l_loss),
                                  avg_l_loss / math.log(2)))
        log.info('=' * 89)
import os
from data import Corpus
import argparse
from model import RNNModel
import numpy as np


def parse_args():
    """Parse the command line and return the resulting namespace.

    The only option is ``-l`` / ``--load_path`` (default ``None``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-l', '--load_path', default=None)
    return cli.parse_args()


# ``torch`` is used below (``torch.load``) but is not among this script's
# imports, so bring it into scope here.
import torch

cl_args = parse_args()
if cl_args.load_path is None:
    # ``args.npy`` is always read from the load path, so the option is
    # effectively required; fail with a clear message instead of a TypeError
    # from ``os.path.join(None, ...)``.
    raise SystemExit('--load_path is required: it must contain args.npy')

dataset = Corpus()
dataset.process_data()
# Ids of the start and end markers in the target vocabulary.
sos = dataset.target_dict.word2idx['<sos>']
eos = dataset.target_dict.word2idx['<eos>']
# NOTE(review): if args.npy stores a pickled object, recent NumPy versions
# need ``allow_pickle=True`` here -- confirm how the file was written.
args = np.load(os.path.join(cl_args.load_path, 'args.npy')).tolist()

model = RNNModel(args).cuda()
model.eval()
if cl_args.load_path:
    file = os.path.join(cl_args.load_path, 'model.pt')
    model.load_state_dict(torch.load(file))

# Sample an output for the first 50 test pairs.
itr = dataset.create_epoch_iterator('test', 1)
for i in range(50):
    # ``next(itr)`` works for both Python 2 and Python 3 iterators.
    source, target = next(itr)
    output = model.sample(source, sos, eos)
Exemple #16
0
# Settings taken from the parsed command line (EVAL_BSZ is fixed).
BPTT, BSZ = args.bptt, args.bsz
EVAL_BSZ = 10
LR, CLIP = args.lr, args.clip
PRINT_EVERY, CUDA = args.log, args.cuda

# Output files for the decoder/encoder dumps; opened here for writing.
DECODER = open("decoder.json", "w")
ENCODER = open("encoder.json", "w")

# Read the corpus and report the vocabulary size.
corpus = Corpus(args.data, CUDA)
vocab_size = len(corpus.dictionary)
print("|V|", vocab_size)

# Batch the training and validation splits.
training_data = batchify(corpus.train, BSZ, CUDA)
validation_data = batchify(corpus.valid, EVAL_BSZ, CUDA)

# Loss function.
loss_function = nn.CrossEntropyLoss()

# Restore a previously saved model when requested.
if args.load:
    print('loading')
    with open(args.save, 'rb') as f:
        model = torch.load(f)
Exemple #17
0
def generate(args):
    """Sample tokens from a saved model and write them to ``args.outf``.

    The model is loaded from ``args.checkpoint`` and primed with
    ``args.start`` (left-padded with the start-of-sequence token, or cut to
    the last ``model.order`` tokens) or, without a start text, with
    ``model.order`` start-of-sequence tokens. ``args.num_samples`` tokens are
    then drawn from the temperature-scaled output distribution.
    End-of-sequence tokens are written as newlines; start-of-sequence tokens
    are skipped when ``args.no_sos`` is set.

    Uses the module-level ``parser`` to report an invalid temperature.
    """
    cuda = torch.cuda.is_available()

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)

    if args.temperature < 1e-3:
        parser.error("--temperature has to be greater or equal 1e-3")

    with open(args.checkpoint, 'rb') as f:
        try:
            model = torch.load(f)
        except Exception:
            # Convert the model to CPU if the model is serialized on GPU.
            # The failed attempt has already consumed part of the stream, so
            # rewind before retrying.
            f.seek(0)
            model = torch.load(f, map_location='cpu')
    model.eval()

    sos = SOS_CHAR if args.use_chars else SOS
    eos = EOS_CHAR if args.use_chars else EOS
    unk = UNK_CHAR if args.use_chars else UNK

    data_dir = os.path.expanduser(args.data_dir)
    corpus = Corpus(data_dir,
                    headers=args.no_headers,
                    lower=args.lower,
                    chars=args.use_chars)

    model_data_checks(model, corpus, args)

    w2i = corpus.dictionary.w2i
    i2w = corpus.dictionary.i2w

    # Build the initial context of exactly ``model.order`` tokens.
    if args.start:
        start = list(args.start) if args.use_chars else args.start.split()
        if args.lower:
            start = [word.lower() for word in start]
        context = start
        if len(context) < model.order:
            context = (model.order - len(context)) * [sos] + context
        elif len(context) > model.order:
            context = context[-model.order:]
    else:
        start = context = [sos] * model.order
    # Out-of-vocabulary tokens are mapped to the unknown token.
    context = [word if word in w2i else unk for word in context]
    ids = [w2i[word] for word in context]
    model_input = Variable(torch.LongTensor(ids).unsqueeze(0))
    model_input = model_input.cuda() if cuda else model_input

    glue = '' if args.use_chars else ' '
    with open(args.outf, 'w') as outf:
        if args.start:
            outf.write(glue.join(start) + glue)
        for i in range(args.num_samples):
            output = model(model_input)
            word_weights = output.squeeze().div(args.temperature).exp().cpu()
            if args.no_unk:
                word_weights[w2i[unk]] = 0
            word_idx = torch.multinomial(word_weights, 1)[0]
            # NOTE(review): ``.data[0]`` is the pre-0.4 PyTorch way to read a
            # scalar; newer releases expect ``.item()`` -- confirm the
            # targeted version before changing.
            word_idx = word_idx.data[0]
            word = i2w[word_idx]

            ids.append(word_idx)
            model_input = Variable(
                torch.LongTensor(ids[-model.order:]).unsqueeze(0))
            model_input = model_input.cuda() if cuda else model_input
            # Compare by value: ``is`` only matches when both names refer to
            # the very same string object, which vocabulary entries are not
            # guaranteed to be.
            if word == sos and args.no_sos:
                continue
            elif word == eos:
                outf.write('\n')
            else:
                outf.write(word + glue)

            if i % 100 == 0:
                print('| Generated {}/{} words'.format(i, args.num_samples))

    print(f'Results saved in `{args.outf}`.')
Exemple #18
0
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from keras.models import load_model
from util import data_generator, generator_y_true
from data import Vocab, EmojiVocab, Corpus
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
import numpy as np

# Restore the trained model from the ``weight`` directory.
model_path = os.path.join('weight', args.model)
model = load_model(model_path)

# Build the vocabularies and the evaluation corpus.
vocab = Vocab(20000)  # 20k
emoji_vocab = EmojiVocab(40)
corpus = Corpus(vocab, emoji_vocab, debug=False, eval=True)

encoded_test = corpus.encoded_test

# Predict over the encoded test split.
num_emoji_classes = len(emoji_vocab)
prediction_steps = len(encoded_test[0]) // (args.batch_size * args.step_size)
test_batches = data_generator(encoded_test, args.batch_size, args.step_size,
                              num_emoji_classes)
y_pred = model.predict_generator(test_batches, prediction_steps, verbose=1)

# Class labels, and the ground truth flattened into a plain list.
target_names = [emoji_vocab.decode(x) for x in range(num_emoji_classes)]
true_batches = generator_y_true(encoded_test, args.batch_size, args.step_size,
                                num_emoji_classes)
y_true = list(np.array(true_batches).reshape(-1))
Exemple #19
0
 def load_corpus(self):
     """Build the corpus and keep each encoded split as a NumPy array.

     Sets ``self.corpus`` plus ``self.encoded_train``, ``self.encoded_dev``
     and ``self.encoded_test``.
     """
     self.corpus = Corpus(self.vocab, self.config.debug)
     for split in ('train', 'dev', 'test'):
         attr = 'encoded_' + split
         setattr(self, attr, np.array(getattr(self.corpus, attr)))
Exemple #20
0
# Hyper-parameter settings
nepoch = 6
batch_size = 20
eval_batch_size = 10
bptt_len = 20
emsize = 200
nhid = 256
nlayers = 2
rnn_type = 'LSTM'
lr = 20
clip_coefficient = 0.5
cuda = True
print_every = 200

# Prepare the data: load the corpus and batch each split
corpus = Corpus('./data')
ntokens = len(corpus.dictionary)
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

# Build the model (moved to the GPU when ``cuda`` is set)
model = RNNModel(rnn_type, ntokens, emsize, nhid, nlayers)
if cuda:
    model.cuda()
loss_fn = nn.functional.cross_entropy

# NOTE(review): train() takes no arguments, so it presumably reads the
# module-level settings above -- confirm against its definition.
prev_val_loss = None
for epoch in range(1, nepoch + 1):
    epoch_start_time = time.time()
    train()
from data import Corpus
import os

# Dump every split of the corpus to a text file, one item per line.
path = "wikitext-2"
corpus = Corpus(path)

# Each file is written inside a `with` block so that it is flushed and
# closed deterministically; the previous version rebound `f` without ever
# closing the earlier handles, which could leave buffered output unwritten.
for split in ('train', 'test', 'valid'):
    with open(os.path.join(path, split + '_.txt'), 'w') as f:
        f.writelines("%s\n" % t for t in getattr(corpus, split))

Exemple #22
0
def main():
    """Evaluate a saved RNN language model on the test split.

    Parses the command-line options, seeds torch, loads the corpus found at
    ``--data`` and the model stored at ``--save``, obtains the correctness
    sequence from ``get_argmax_correctness`` for the batchified test data and
    passes it, together with the corresponding original tokens, to
    ``my_eval``.

    Raises:
        AssertionError: if the number of decoded tokens differs from the
            length of the correctness sequence.
    """
    parser = argparse.ArgumentParser(description='Baseline RNN Language Model')
    parser.add_argument('--data',
                        type=str,
                        default='./data/',
                        help='location of the data corpus')
    parser.add_argument(
        '--test_path',
        type=str,
        default=None,
        help=
        'location of the test corpus to calculate word or character-level perplexity'
    )
    parser.add_argument(
        '--input',
        type=str,
        default='word',
        help='input level (word, grapheme, bpe, syllable, morfessor, char)')
    parser.add_argument(
        '--model',
        type=str,
        default='LSTM',
        help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
    parser.add_argument('--emsize',
                        type=int,
                        default=650,
                        help='size of word embeddings')
    parser.add_argument('--nhid',
                        type=int,
                        default=650,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers',
                        type=int,
                        default=2,
                        help='number of layers')
    parser.add_argument('--lr',
                        type=float,
                        default=1,
                        help='initial learning rate')
    parser.add_argument('--clip',
                        type=float,
                        default=10,
                        help='gradient clipping')
    parser.add_argument('--epochs',
                        type=int,
                        default=40,
                        help='upper epoch limit')
    parser.add_argument('--batch_size',
                        type=int,
                        default=20,
                        metavar='N',
                        help='batch size')
    parser.add_argument('--bptt', type=int, default=35, help='sequence length')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--seed', type=int, default=234, help='random seed')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--log-interval',
                        type=int,
                        default=200,
                        metavar='N',
                        help='report interval')
    parser.add_argument('--save',
                        type=str,
                        default='model.pt',
                        help='path to save the final model')
    parser.add_argument('--onnx_export',
                        type=str,
                        default='',
                        help='path to export the final model in onnx format')
    # Granularity passed to my_eval: word, char or bpe.
    parser.add_argument("--corr_type", type=str, default="word")
    args = parser.parse_args()

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available() and not args.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )

    device = torch.device("cuda" if args.cuda else "cpu")

    ###############################################################################
    # Load data
    ###############################################################################

    corpus = Corpus(args.data)

    # Only the test split is needed for this evaluation.
    eval_batch_size = 10
    test_data = batchify(corpus.test, eval_batch_size, device)

    ###############################################################################
    # Evaluation code
    ###############################################################################

    # Load the best saved model.
    with open(args.save, 'rb') as f:
        if args.cuda:
            model = torch.load(f)
        else:
            model = torch.load(f, map_location='cpu')
        # after load the rnn params are not a continuous chunk of memory
        # this makes them a continuous chunk, and will speed up forward pass
        model.rnn.flatten_parameters()

    # Run on test
    correctness = get_argmax_correctness(args, model, test_data,
                                         eval_batch_size)
    idx2word = corpus.dictionary.idx2word
    # `.cpu()` is required before `.numpy()`: test_data was created for
    # `device`, and a CUDA tensor cannot be converted to numpy directly.
    # On a CPU tensor the call is a no-op.
    flat_token_ids = test_data.t().contiguous().view(-1).cpu().numpy()
    orig_words = [idx2word[z] for z in flat_token_ids]
    assert len(orig_words) == len(correctness)
    my_eval(correctness, orig_words, args.corr_type)
Exemple #23
0
    args = p.parse_args()
    logging.basicConfig(level=logging.INFO)

    # Use the GPU only when it was requested AND is actually available.
    if args.enable_cuda and torch.cuda.is_available():
        enable_cuda = True
        # NOTE(review): the device index is hard-coded to 1 -- confirm this
        # matches the target machine.
        torch.cuda.set_device(1)
        logging.info("CUDA is enabled")
    else:
        enable_cuda = False
        logging.info("CUDA is disabled")

    # Prepare corpus, encoder and decoder.
    # The corpus receives the resolved `enable_cuda` flag (not the raw
    # `args.enable_cuda`) so that it agrees with the encoder/decoder when
    # CUDA was requested but no device is available.
    corpus = Corpus(args.english_train, args.french_train, args.batch_size,
                    args.num_symbols, args.min_count, args.lower,
                    enable_cuda)
    if args.enc_type.lower() == "transformer":
        encoder = TransformerEncoder(args.dim, corpus.vocab_size_e,
                                     corpus.max_pos, enable_cuda)
    else:
        encoder = Encoder(args.dim, corpus.vocab_size_e, corpus.max_pos,
                          args.enc_type, enable_cuda)
    valid = corpus.load_data(args.english_valid, args.french_valid)

    # The decoder is given the index of the "</s>" entry of the French
    # dictionary.
    eos = corpus.dict_f.word2index["</s>"]
    decoder = Decoder(args.dim, corpus.vocab_size_f, eos,
                      corpus.longest_english, args.dec_type, args.attention)
    if enable_cuda:
        encoder.cuda()
        decoder.cuda()
Exemple #24
0
    # Model hyperparameters, all read from the `model` section of the config.
    model_cfg = cfg['model']
    hidden_size, nlayers, batch_size, input_size = (
        model_cfg[key]
        for key in ('hidden_size', 'nlayers', 'batch_size', 'input_size'))
    epochs = model_cfg['epochs']
    lr = float(model_cfg['lr'])
    seq_len = model_cfg['seq_len']
    # Reminder left by the author, apparently a constructor signature:
    # (self, C, nlayers, vocab_size, input_size, hidden_size, lr)

    # Build the corpus from the configured location.
    corpus_loc = cfg['corpus_loc']
    print('test')
    print(corpus_loc)
    corpus = Corpus(corpus_loc)
    vocab_size = len(corpus.dict)

    # Per-split batch sizes.
    train_batch_size, eval_batch_size, test_batch_size = (
        model_cfg[key]
        for key in ('train_batch_size', 'eval_batch_size', 'test_batch_size'))

    corpus_train = batchify(corpus.train, train_batch_size)
    corpus_valid = batchify(corpus.val, eval_batch_size)
    corpus_test = batchify(corpus.test, test_batch_size)

    # 0 modulo the first dimension, i.e. batch index 0.
    batch_no = 0 % corpus_train.shape[0]

    batch = get_batch(corpus_train, batch_no, train_batch_size)
Exemple #25
0
    # Obtain the head indices from the score matrix S via the `mst` helper.
    heads = mst(S)

    # Predict labels
    # Broadcast the head indices to S_lab.size(0) rows so that they can be
    # used as a gather index over S_lab.
    select = torch.LongTensor(heads).unsqueeze(0).expand(S_lab.size(0), -1)
    select = Variable(select)
    # Gather along dim 1 of S_lab with the chosen heads, then drop the
    # singleton dim that the index introduced.
    selected = torch.gather(S_lab, 1, select.unsqueeze(1)).squeeze(1)
    # Index of the maximum along dim 0 for every remaining position
    # (presumably the label axis -- confirm against the model's output).
    _, labels = selected.max(dim=0)
    labels = labels.data.numpy()
    return heads, labels


# Demo entry point: load a saved model, score one training batch, plot the arc
# scores and compare predictions with the gold heads/labels.
if __name__ == '__main__':

    data_path = '../../stanford-ptb'
    vocab_path = 'vocab/train'
    model_path = 'models/model.pt'

    # NOTE(review): `dictionary` is not used again in this block.
    dictionary = Dictionary(vocab_path)
    corpus = Corpus(data_path=data_path, vocab_path=vocab_path)
    model = torch.load(model_path)
    batches = corpus.train.batches(1)

    # Take the first batch and run the model on it.
    words, tags, heads, labels = next(batches)
    S_arc, S_lab = model(words, tags)

    plot(S_arc, heads)
    # `words` and `tags` are rebound to the SAME list object here, replacing
    # the batch tensors unpacked above.
    words = tags = [1, 2, 3, 4]
    heads_pred, labels_pred = predict(model, words, tags)
    # Print each prediction followed by the first gold entry of the batch.
    print(heads_pred, '\n', heads[0].data.numpy())
    print(labels_pred, '\n', labels[0].data.numpy())
Exemple #26
0
import numpy as np
from config import load_config


def parse_args():
    """Parse this script's command-line options.

    Options:
        -p / --path: required.
        -s / --save_path: defaults to ``'./'``.
        -l / --load_path: defaults to ``None``.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    option_table = (
        (('-p', '--path'), {'required': True}),
        (('-s', '--save_path'), {'default': './'}),
        (('-l', '--load_path'), {'default': None}),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()


# Read the CLI options and the configuration file they point to.
args = parse_args()
cf = load_config(args.path)

# Build the dataset and record both vocabulary sizes on the config.
dataset = Corpus()
dataset.process_data()

cf.ntokens_source = len(dataset.source_dict)
cf.ntokens_target = len(dataset.target_dict)

# Create the output directory if needed. `exist_ok=True` avoids the race
# between checking for the directory and creating it, and fails early if the
# path exists but is not a directory.
os.makedirs(args.save_path, exist_ok=True)

# Positions whose target is the '<pad>' index are ignored by the loss.
criterion = nn.CrossEntropyLoss(
    ignore_index=dataset.target_dict.word2idx['<pad>'])

model = RNNModel(cf).cuda()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-4)

if args.load_path:
from data import Corpus


# Load the documents under `data/docs`, run `auto_tags` on them and write the
# result to a CoNLL-U file (with `write_status=True`).
corpus = Corpus.load_from_folder("data/docs")
corpus.auto_tags()
corpus.save_conllu("todo/vi_corpus_v1_todo.conllu", write_status=True)

# Load a second corpus from an existing CoNLL-U file.
# NOTE(review): `tagged_corpus` is not used in the visible part of the script.
tagged_corpus = Corpus.load_from_conllu_file("vi_corpus_v1.conllu")

# NOTE(review): looks like a leftover debug marker -- confirm before removing.
print(0)
# content = "\n\n".join(tokenizes)
# open("tmp/tokenize_data.txt", "w").write(content)
Exemple #28
0
# Pick the evaluation device first so the checkpoint can be mapped onto it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device(
    'cpu')

# restoring model
savepath = params['filepath'].get('ckpt')
# `map_location` lets a checkpoint that was saved on a GPU be loaded on a
# CPU-only machine, matching the CPU fallback selected above.
ckpt = torch.load(savepath, map_location=device)

vocab = ckpt['vocab']

model = SeNet(num_classes=params['num_classes'], vocab=vocab)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

# create dataset, dataloader
tagger = Okt()
padder = PadSequence(length=30)
tst_data = read_data(params['filepath'].get('tst'))
tst_data = remove_na(tst_data)
tst_dataset = Corpus(tst_data, vocab, tagger, padder)
tst_dataloader = DataLoader(tst_dataset, batch_size=128)

model.to(device)

# evaluation: count the mini-batch predictions that match their targets.
correct_count = 0
# Gradients are never needed here, so disable them once for the whole loop.
with torch.no_grad():
    for x_mb, y_mb in tqdm(tst_dataloader):
        x_mb = x_mb.to(device)
        y_mb = y_mb.to(device)
        y_mb_hat = model(x_mb)
        # Index of the maximum along dim 1 is the predicted class.
        y_mb_hat = torch.max(y_mb_hat, 1)[1]
        correct_count += (y_mb_hat == y_mb).sum().item()
# Remaining hyperparameters taken from the command line.
weightDecay = args.wd
K = args.K

# Restrict torch to one thread, seed every random number generator with the
# same value and select the GPU to run on.
torch.set_num_threads(1)
torch.manual_seed(seed)
random.seed(seed)
torch.cuda.set_device(gpuId)
torch.cuda.manual_seed(seed)

# Build the corpus from the training and development files.
corpus = Corpus(
    sourceTrainFile=sourceTrainFile,
    sourceOrigTrainFile=sourceOrigTrainFile,
    targetTrainFile=targetTrainFile,
    sourceDevFile=sourceDevFile,
    sourceOrigDevFile=sourceOrigDevFile,
    targetDevFile=targetDevFile,
    minFreqSource=minFreqSource,
    minFreqTarget=minFreqTarget,
    maxTokenLen=maxLen,
)

# Report the vocabulary and dataset sizes.
print(f'Source vocabulary size: {corpus.sourceVoc.size()!s}')
print(f'Target vocabulary size: {corpus.targetVoc.size()!s}')
print()
print(f'# of training samples: {len(corpus.trainData)!s}')
print(f'# of develop samples:  {len(corpus.devData)!s}')
print('Random seed: ', str(seed))

# True only when K is positive and no larger than the target vocabulary.
useSmallSoftmax = 0 < K <= corpus.targetVoc.size()

if useSmallSoftmax:
Exemple #30
0
    torch.cuda.manual_seed(args.seed)

# Start from the default configuration; if a checkpoint exists at `args.save`
# and carries a 'config' entry, that entry replaces the default.
config = Config()
if os.path.isfile(args.save):
    checkpoint = torch.load(args.save)
    if 'config' in checkpoint:
        print("Loading saved config")
        config = checkpoint['config']
print(config)

# Both corpora share one dictionary and the same construction options.
dictionary = Dictionary()
corpus_options = dict(
    create_dict=True,
    use_cuda=args.cuda,
    n_gram=config.n_gram,
    context_mode=config.context_mode,
)
training_corpus = Corpus(args.data + "/train.txt", dictionary,
                         **corpus_options)
validation_corpus = Corpus(args.data + "/valid.txt", dictionary,
                           **corpus_options)

# TensorboardX summary writer, logging under `saved_runs/`.
writer = SummaryWriter("saved_runs/" + args.save)

# Word embeddings: one row per dictionary entry, index 0 used as padding.
embedding = nn.Embedding(len(dictionary), config.em_size, padding_idx=0)
if config.pre_trained: