def decode(test_src_path,
           test_tgt_path=None,
           model_path='model.bin',
           beam_size=5,
           max_decoding=70,
           device='cpu',
           output_path='output.txt'):
    """ Performs decoding on a test set and saves the best-scoring results.
    If the gold-standard target sentences are given, the function also
    computes the corpus-level BLEU score.
    Params:
        test_src_path (str): Path to the test source file
        test_tgt_path (str): Path to the test target file (optional). Default=None
        model_path (str): Path to the model file generated after training. Default='model.bin'
        beam_size (int): Beam size (# of hypotheses to hold for a translation at every step). Default=5
        max_decoding (int): Maximum sentence length that beam search can produce. Default=70
        device (str): Device to perform the computation on. Default='cpu'
        output_path (str): Path for the output file with the translation results. Default='output.txt'
    """
    print(f'load test source sentences from [{test_src_path}]', file=sys.stderr)
    test_data_src = read_corpus(test_src_path, corpus_type='src')

    if test_tgt_path is not None:
        print(f'load test target sentences from [{test_tgt_path}]', file=sys.stderr)
        test_data_tgt = read_corpus(test_tgt_path, corpus_type='tgt')

    print(f'load model from {model_path}', file=sys.stderr)
    model = NMT.load(model_path)
    model = model.to(torch.device(device))

    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=max_decoding)

    if test_tgt_path is not None:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(output_path, 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
""" # train the spm model spm.SentencePieceTrainer.train(input=file_path, model_prefix=corpus_type, vocab_size=vocab_size) # create an instance; this saves .model and .vocab files sp = spm.SentencePieceProcessor() # loads tgt.model or src.model sp.load('{}.model'.format(corpus_type)) sp_list = [sp.id_to_piece(piece_id) for piece_id in range(sp.get_piece_size())] return sp_list if __name__ == '__main__': args = docopt(__doc__) if args["--tok"] == 'nltk': print(f'read in source sentences: {args["--src"]}...') src_sents = read_corpus(args['--src'], corpus_type='src') print(f'read in target sentences: {args["--tgt"]}...') tgt_sents = read_corpus(args['--tgt'], corpus_type='tgt') print('generating vocab...') vocab = Vocab.build(src_sents, tgt_sents, int(args['--size']), int(args['--cutoff'])) print(f'Vocab generated: {vocab}') jfile = f'{args["--f"]}_{args["--size"]}.json' elif args["--tok"] == 'spm': print(f'read in source sentences: {args["--src"]}...') src_sents = get_vocab_list(args['--src'], corpus_type='src', vocab_size=args["--src-size"]) print(f'read in target sentences: {args["--tgt"]}...')
def decode(test_src_path,
           test_tgt_path=None,
           model_path='model.bin',
           tokenizer='nltk',
           spm_model_src='./spm/src.model',
           spm_model_tgt='./spm/tgt.model',
           beam_size=5,
           max_decoding=70,
           device='cpu',
           output_path='output.txt'):
    """ Performs decoding on a test set and saves the best-scoring results.
    If the gold-standard target sentences are given, the function also
    computes the corpus-level BLEU score.
    Params:
        test_src_path (str): Path to the test source file
        test_tgt_path (str): Path to the test target file (optional). Default=None
        model_path (str): Path to the model file generated after training. Default='model.bin'
        tokenizer (str): Tokenizer used (nltk or spm). Default='nltk'
        spm_model_src (str): Path to the source spm model. Default='./spm/src.model'
        spm_model_tgt (str): Path to the target spm model. Default='./spm/tgt.model'
        beam_size (int): Beam size (# of hypotheses to hold for a translation at every step). Default=5
        max_decoding (int): Maximum sentence length that beam search can produce. Default=70
        device (str): Device to perform the computation on. Default='cpu'
        output_path (str): Path for the output file with the translation results. Default='output.txt'
    """
    print(f'load test source sentences from [{test_src_path}]', file=sys.stderr)
    if tokenizer == 'nltk':
        test_data_src = read_corpus(test_src_path, corpus_type='src')
    elif tokenizer == 'spm':
        test_data_src = read_corpus_spm(test_src_path,
                                        corpus_type='src',
                                        model_path=spm_model_src)
    else:
        raise ValueError(f'Unrecognised tokenizer [{tokenizer}]; should be nltk or spm')

    if test_tgt_path is not None:
        print(f'load test target sentences from [{test_tgt_path}]', file=sys.stderr)
        if tokenizer == 'nltk':
            test_data_tgt = read_corpus(test_tgt_path, corpus_type='tgt')
        else:  # spm
            test_data_tgt = read_corpus_spm(test_tgt_path,
                                            corpus_type='tgt',
                                            model_path=spm_model_tgt)

    print(f'load model from {model_path}', file=sys.stderr)
    model = NMT.load(model_path)
    model = model.to(torch.device(device))

    hypotheses = beam_search(model,
                             test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=max_decoding)

    if test_tgt_path is not None:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print(f'Corpus BLEU: {bleu_score}', file=sys.stderr)

    with open(output_path, 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            if tokenizer == 'nltk':
                hyp_sent = ' '.join(top_hyp.value)
            else:  # spm: merge pieces and restore the word boundaries marked by '▁'
                hyp_sent = ''.join(top_hyp.value).replace('▁', ' ')
            f.write(hyp_sent + '\n')
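
# A minimal usage sketch for decode(), assuming German-to-English test files at
# the hypothetical paths below; nothing here runs on import, it only documents
# how the function is meant to be called.
def _example_decode():
    """Hedged example: decode a test set with an spm-tokenized model and score BLEU."""
    decode('data/test.de',                   # hypothetical source file
           test_tgt_path='data/test.en',     # optional; enables corpus-level BLEU
           model_path='model.bin',
           tokenizer='spm',
           spm_model_src='./spm/src.model',
           spm_model_tgt='./spm/tgt.model',
           beam_size=5,
           max_decoding=70,
           device='cuda:0' if torch.cuda.is_available() else 'cpu',
           output_path='output.txt')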
def load_data_train(train_src_path,
                    train_tgt_path,
                    dev_src_path,
                    dev_tgt_path,
                    tokenizer='nltk',
                    spm_model_src='./spm/src.model',
                    spm_model_tgt='./spm/tgt.model',
                    create_vocab=False,
                    vocab_path=None,
                    vocab_size=50000,
                    vocab_cutoff=2,
                    subset=1.0,
                    random_subset=False):
    """ Load the dataset used by the NMT model.
    Params:
        train_src_path (str): Path to the source sentences for training
        train_tgt_path (str): Path to the target sentences for training
        dev_src_path (str): Path to the source sentences for dev
        dev_tgt_path (str): Path to the target sentences for dev
        tokenizer (str): Tokenizer used (nltk or spm). Default='nltk'
        spm_model_src (str): Path to the source spm model. Default='./spm/src.model'
        spm_model_tgt (str): Path to the target spm model. Default='./spm/tgt.model'
        create_vocab (bool): If True, the vocab is built from the training data
            (nltk tokenizer only); otherwise it is loaded from vocab_path. Default=False
        vocab_path (str): Path to the json file with the Vocab. Default=None
        vocab_size (int): Size of the vocabulary for both source and target languages. Default=50000
        vocab_cutoff (int): Drop words that occur fewer than vocab_cutoff times. Default=2
        subset (float): Fraction of the train and dev sets to load, with 0 < subset <= 1. Default=1.0
        random_subset (bool): If True, the subset is sampled randomly; otherwise the
            first elements of the data are taken. Default=False
    Return:
        train_data (list of (src_sent, tgt_sent)): tuples of source and target sentences for training
        dev_data (list of (src_sent, tgt_sent)): tuples of source and target sentences for dev
        vocab (Vocab): Vocab object for source and target
    """
    # read all data
    if tokenizer == 'nltk':
        train_data_src = read_corpus(train_src_path, corpus_type='src')
        train_data_tgt = read_corpus(train_tgt_path, corpus_type='tgt')
        dev_data_src = read_corpus(dev_src_path, corpus_type='src')
        dev_data_tgt = read_corpus(dev_tgt_path, corpus_type='tgt')
    elif tokenizer == 'spm':
        train_data_src = read_corpus_spm(train_src_path, corpus_type='src', model_path=spm_model_src)
        train_data_tgt = read_corpus_spm(train_tgt_path, corpus_type='tgt', model_path=spm_model_tgt)
        dev_data_src = read_corpus_spm(dev_src_path, corpus_type='src', model_path=spm_model_src)
        dev_data_tgt = read_corpus_spm(dev_tgt_path, corpus_type='tgt', model_path=spm_model_tgt)
    else:
        raise ValueError(f'Incorrect value [{tokenizer}] for tokenizer; should be nltk or spm')

    if subset == 1:
        train_data = list(zip(train_data_src, train_data_tgt))
        dev_data = list(zip(dev_data_src, dev_data_tgt))
    elif 0 < subset < 1:
        num_train = int(subset * len(train_data_src))
        num_dev = int(subset * len(dev_data_src))
        if random_subset:
            train = list(zip(train_data_src, train_data_tgt))
            dev = list(zip(dev_data_src, dev_data_tgt))
            random.shuffle(train)
            random.shuffle(dev)
            train_data_src, train_data_tgt = zip(*train)
            dev_data_src, dev_data_tgt = zip(*dev)
        train_data = list(zip(train_data_src[:num_train], train_data_tgt[:num_train]))
        dev_data = list(zip(dev_data_src[:num_dev], dev_data_tgt[:num_dev]))
    else:
        raise ValueError(f'Incorrect value [{subset}] for subset; should be 0 < subset <= 1')

    if create_vocab and tokenizer == 'nltk':
        src_sents, tgt_sents = zip(*train_data)
        vocab = Vocab.build(src_sents, tgt_sents, vocab_size, vocab_cutoff)
    elif vocab_path is not None:
        vocab = Vocab.load(vocab_path)
    else:
        raise ValueError(
            f'Incorrect combination of tokenizer [{tokenizer}], '
            f'create_vocab [{create_vocab}] and vocab_path [{vocab_path}]')

    return train_data, dev_data, vocab
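
# A minimal usage sketch for load_data_train(), assuming hypothetical data paths
# and a pre-built vocab file; nothing here runs on import.
def _example_load_data_train():
    """Hedged example: load 10% of a German-English corpus with an existing vocab."""
    train_data, dev_data, vocab = load_data_train(
        'data/train.de', 'data/train.en',   # hypothetical training files
        'data/dev.de', 'data/dev.en',       # hypothetical dev files
        tokenizer='nltk',
        create_vocab=False,
        vocab_path='vocab_50000.json',      # hypothetical pre-built vocab file
        subset=0.1,                         # keep 10% of each split
        random_subset=True)
    return train_data, dev_data, vocab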
def trainOld(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], corpus_type='src')
    train_data_tgt = read_corpus(args['--train-tgt'], corpus_type='tgt')

    dev_data_src = read_corpus(args['--dev-src'], corpus_type='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], corpus_type='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab)
    model.train()

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt['<pad>']] = 0

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec'
                      % (epoch, train_iter,
                         report_loss / report_examples,
                         math.exp(report_loss / report_tgt_words),
                         cum_examples,
                         report_tgt_words / (time.time() - train_time),
                         time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                      % (epoch, train_iter,
                         cum_loss / cum_examples,
                         np.exp(cum_loss / cum_tgt_words),
                         cum_examples),
                      file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu; dev batch size can be a bit larger
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizer's state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr,
                              file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path,
                                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
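
# A hedged sketch of how trainOld() might be driven without docopt: the args
# dict below mirrors the cmd-line flags the function reads, with made-up values
# (paths and hyperparameters are hypothetical placeholders). Values are strings
# because docopt yields strings and trainOld() casts them with int()/float().
def _example_train_old():
    """Hypothetical example: call trainOld() with a hand-built args dict."""
    args = {
        '--train-src': 'data/train.de', '--train-tgt': 'data/train.en',
        '--dev-src': 'data/dev.de', '--dev-tgt': 'data/dev.en',
        '--vocab': 'vocab.json', '--save-to': 'model.bin',
        '--batch-size': '32', '--clip-grad': '5.0', '--valid-niter': '2000',
        '--log-every': '10', '--embed-size': '256', '--hidden-size': '256',
        '--dropout': '0.3', '--uniform-init': '0.1', '--lr': '0.001',
        '--lr-decay': '0.5', '--patience': '5', '--max-num-trial': '5',
        '--max-epoch': '30', '--cuda': False,
    }
    trainOld(args)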
def load_data_train(train_src_path,
                    train_tgt_path,
                    dev_src_path,
                    dev_tgt_path,
                    create_vocab=False,
                    vocab_path=None,
                    vocab_size=50000,
                    vocab_cutoff=2,
                    subset=1.0,
                    random_subset=False):
    """ Load the dataset used by the NMT model.
    Params:
        train_src_path (str): Path to the source sentences for training
        train_tgt_path (str): Path to the target sentences for training
        dev_src_path (str): Path to the source sentences for dev
        dev_tgt_path (str): Path to the target sentences for dev
        create_vocab (bool): If True, the vocab is built from the training data;
            otherwise it is loaded from vocab_path. Default=False
        vocab_path (str): Path to the json file with the Vocab. Default=None
        vocab_size (int): Size of the vocabulary for both source and target languages. Default=50000
        vocab_cutoff (int): Drop words that occur fewer than vocab_cutoff times. Default=2
        subset (float): Fraction of the train and dev sets to load, with 0 < subset <= 1. Default=1.0
        random_subset (bool): If True, the subset is sampled randomly; otherwise the
            first elements of the data are taken. Default=False
    Return:
        train_data (list of (src_sent, tgt_sent)): tuples of source and target sentences for training
        dev_data (list of (src_sent, tgt_sent)): tuples of source and target sentences for dev
        vocab (Vocab): Vocab object for source and target
    """
    # read all data
    train_data_src = read_corpus(train_src_path, corpus_type='src')
    train_data_tgt = read_corpus(train_tgt_path, corpus_type='tgt')

    dev_data_src = read_corpus(dev_src_path, corpus_type='src')
    dev_data_tgt = read_corpus(dev_tgt_path, corpus_type='tgt')

    if subset == 1:
        train_data = list(zip(train_data_src, train_data_tgt))
        dev_data = list(zip(dev_data_src, dev_data_tgt))
    elif 0 < subset < 1:
        num_train = int(subset * len(train_data_src))
        num_dev = int(subset * len(dev_data_src))
        if random_subset:
            train = list(zip(train_data_src, train_data_tgt))
            dev = list(zip(dev_data_src, dev_data_tgt))
            random.shuffle(train)
            random.shuffle(dev)
            train_data_src, train_data_tgt = zip(*train)
            dev_data_src, dev_data_tgt = zip(*dev)
        train_data = list(zip(train_data_src[:num_train], train_data_tgt[:num_train]))
        dev_data = list(zip(dev_data_src[:num_dev], dev_data_tgt[:num_dev]))
    else:
        raise ValueError(f'Incorrect value [{subset}] for subset; should be 0 < subset <= 1')

    if create_vocab:
        src_sents, tgt_sents = zip(*train_data)
        vocab = Vocab.build(src_sents, tgt_sents, vocab_size, vocab_cutoff)
    elif vocab_path is not None:
        vocab = Vocab.load(vocab_path)
    else:
        raise ValueError('vocab_path is None and create_vocab is False')

    return train_data, dev_data, vocab