Example #1
        def build_embedding(dictionary, embed_dim, is_encoder, path=None):

            if path is not None:
                if path.startswith('elmo:'):
                    lm_path = path[5:]
                    task = LanguageModelingTask(args, dictionary, dictionary)
                    models, _ = utils.load_ensemble_for_inference(
                        [lm_path], task, {'remove_head': True})
                    assert len(models) == 1, \
                        'ensembles are currently not supported for elmo embeddings'

                    embedder = ElmoTokenEmbedder(
                        models[0],
                        dictionary.eos(),
                        dictionary.pad(),
                        add_bos=is_encoder,
                        remove_bos=is_encoder,
                        combine_tower_states=is_encoder,
                        projection_dim=embed_dim,
                        add_final_predictive=is_encoder,
                        add_final_context=is_encoder)
                    return embedder, 1
                elif path.startswith('bilm:'):
                    lm_path = path[5:]
                    task = LanguageModelingTask(args, dictionary, dictionary)
                    models, _ = utils.load_ensemble_for_inference(
                        [lm_path], task, {
                            'remove_head': True,
                            'dropout': args.bilm_model_dropout,
                            'attention_dropout': args.bilm_attention_dropout,
                            'relu_dropout': args.bilm_relu_dropout,
                        })
                    assert len(models) == 1, \
                        'ensembles are currently not supported for bilm embeddings'

                    return BILMEmbedder(models[0], args, args.encoder_embed_dim) if is_encoder \
                        else LMEmbedder(models[0], args.decoder_embed_dim)

            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            emb = nn.Embedding(num_embeddings, embed_dim, padding_idx)
            # if provided, load from preloaded dictionaries
            if path:
                embed_dict = utils.parse_embedding(path)
                utils.load_embedding(embed_dict, dictionary, emb)
            return emb
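
A hedged usage sketch (not from the original source): build_embedding dispatches on the path prefix, so 'elmo:' and 'bilm:' load a pretrained LM checkpoint as the embedder, while any other path is parsed as a plain embedding file. The paths below are placeholders, and `dictionary` and `args` are assumed to exist in the enclosing scope; note that the 'elmo:' branch returns a tuple `(embedder, 1)` while the other branches return the embedder alone.

        # Placeholder paths; `dictionary` and `args` come from the enclosing scope.
        elmo_emb, _ = build_embedding(dictionary, 512, is_encoder=True,
                                      path='elmo:/path/to/lm_checkpoint.pt')
        bilm_emb = build_embedding(dictionary, 512, is_encoder=True,
                                   path='bilm:/path/to/lm_checkpoint.pt')
        plain_emb = build_embedding(dictionary, 512, is_encoder=False,
                                    path='/path/to/embeddings.txt')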
Example #2
    def __init__(self, parsed_args):
        self.args = parsed_args
        import_user_module(parsed_args)
        assert parsed_args.path is not None, '--path required for evaluation'

        self.use_cuda = torch.cuda.is_available() and not parsed_args.cpu

        self.task = tasks.setup_task(parsed_args)

        # Load ensemble
        print('| loading model(s) from {}'.format(parsed_args.path))
        self.models, args = utils.load_ensemble_for_inference(
            parsed_args.path.split(':'), self.task, model_arg_overrides=eval(parsed_args.model_overrides),
        )
        

        for model in self.models:
            model.make_generation_fast_()
            if self.use_cuda:
                model.cuda()

        for arg in vars(parsed_args).keys():
            if arg not in {'self_target', 'future_target', 'past_target', 'tokens_per_sample',
                           'output_size_dictionary'}:
                setattr(args, arg, getattr(parsed_args, arg))
        self.task = tasks.setup_task(args)

        self.gen_timer = StopwatchMeter()
        self.scorer = SequenceScorer(self.task.target_dictionary)
Example #3
def generate_score(args, task, dataset, lang_pair=None):
    """
    Generation for single- and multi-model training.

    Args:
        args: Command-line arguments.
        task: FairseqTask object.
        dataset: Dataset object for a specific split and a specific model.
        lang_pair: Model key in a multi-model setup; pass None in a
            single-model setup.
    """
    models, _ = utils.load_ensemble_for_inference(args.path.split(":"), task)
    if lang_pair and len(models) > 0 and isinstance(models[0],
                                                    FairseqMultiModel):
        return _generate_score(
            models=[multi_model.models[lang_pair] for multi_model in models],
            args=args,
            task=task,
            dataset=dataset,
        )
    else:
        return _generate_score(models=models,
                               args=args,
                               task=task,
                               dataset=dataset)
Example #4
    def load_model(self, args):
        use_cuda = torch.cuda.is_available() and not args.cpu
        task = tasks.setup_task(args)
        logger.info('loading edit model from {}'.format(args.path))
        models, model_args = utils.load_ensemble_for_inference(
            args.path.split(':'), task)
        return task, models[0], model_args
Example #5
    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_architecture(args)

        if not hasattr(args, 'max_source_positions'):
            args.max_source_positions = 512
        if not hasattr(args, 'max_target_positions'):
            args.max_target_positions = 512

        dictionary = task.source_dictionary
        assert args.bert_path is not None
        args.short_seq_prob = 0.0
        task = BertTask(args, dictionary)
        #HACK_PATH = "/checkpoint/wangalexc/fairseq/bert-pretrained/20190520/checkpoint_best.pt"
        HACK_PATH = "/misc/vlgscratch4/BowmanGroup/awang/ckpts/fairseq/bert/best_pretrained_bert.pt"
        models, _ = utils.load_ensemble_for_inference([HACK_PATH], task,
                                                      {'save_masks': False})
        #models, _ = utils.load_ensemble_for_inference([args.bert_path], task, {'save_masks' : False})
        assert len(models) == 1, \
            'ensembles are currently not supported for BERT embeddings'
        model = models[0]
        return FTSummerization(args, model)
Example #6
def load_fairseq_lm_model_and_dict(checkpoint_path, data_path):
    # Initialize model
    parser = options.get_eval_lm_parser()
    parsed_args = options.parse_args_and_arch(parser, ['--path', checkpoint_path, data_path])
    task = tasks.setup_task(parsed_args)
    models, _ = utils.load_ensemble_for_inference([checkpoint_path], task)
    return models[0], task.dictionary
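
A minimal usage sketch, not from the original source; the paths are placeholders. The helper builds the eval-lm argument namespace itself, so only a checkpoint file and the preprocessed data directory are needed.

lm_model, lm_dict = load_fairseq_lm_model_and_dict(
    '/path/to/checkpoint_best.pt',  # placeholder checkpoint path
    '/path/to/data-bin',            # placeholder data directory
)
lm_model.eval()
print('| LM dictionary: {} types'.format(len(lm_dict)))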
Example #7
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    if not args.quiet:
        print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _ = utils.load_ensemble_for_inference(args.path.split(':'), task)

    # Optimize ensemble for generation
    for i, model in enumerate(models):
        models[i].make_generation_fast_(beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)
        if args.fp16:
            models[i].half()

    if args.decode_source_file is not None:
        print('| [decode] decode from file')
        decode_from_file(models, task, args, use_cuda)
    else:
        print('| [decode] decode from dataset')
        decode_from_dataset(models, task, args, use_cuda)
Example #8
def main(parsed_args):
    assert parsed_args.path is not None, '--path required for evaluation!'

    print(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    print('| loading model(s) from {}'.format(parsed_args.path))
    models, args = utils.load_ensemble_for_inference(
        parsed_args.path.split(':'), task)

    assert len(models) == 1

    model = models[0]
    if use_cuda:
        model.cuda()

    for arg in vars(parsed_args).keys():
        setattr(args, arg, getattr(parsed_args, arg))
    task = tasks.setup_task(args)

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))

    eval_dataset(task, model, task.dataset(args.gen_subset), args.out_file,
                 args.thresholds, args.compute_metrics, use_cuda)
Example #9
def generate_score(args, dataset, dataset_split):
    models, _ = utils.load_ensemble_for_inference(
        args.path,
        dataset.src_dict,
        dataset.dst_dict,
    )
    return _generate_score(models, args, dataset, dataset_split)
Example #10
    def __init__(self, args):
        super().__init__()

        fairseq_pretrained_model = str(args.data) + "/" + str(
            args.fairseq_model_name)
        pre_task = tasks.setup_task(args)
        print('| loading model from {}'.format(fairseq_pretrained_model))
        self.dict_file = fairseq_pretrained_model + "/dict.txt"
        models, self.model_args = utils.load_ensemble_for_inference(
            [fairseq_pretrained_model], pre_task)

        copy_args(args, self.model_args)

        self.task = tasks.setup_task(self.model_args)
        self.model = models[0]

        self.decoder = self.model.decoder  # <class 'fairseq.models.fconv.FConvDecoder'>

        # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
        self.model.make_generation_fast_()

        self.map_indices = None
        self.pad = self.task.target_dictionary.pad

        # add MASK to dictionary
        self.mask_id = self.task.dictionary.add_symbol(MASK)

        self.vocab = self.task.dictionary.symbols[:]

        # reinitialize inverse vocab
        self._init_inverse_vocab()
        self.unk_index = self.inverse_vocab[FAIRSEQ_UNK]
Example #11
def setup_model(args):
    import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    logger.info('fairseq args: {}'.format(args))

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    logger.info('| loading model(s) from {}'.format(args.path))
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(':'),
        task,
        model_arg_overrides=eval(args.model_overrides),
    )

    # Set dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )

    translator = SequenceGenerator(
        models,
        tgt_dict,
        beam_size=args.beam,
        minlen=args.min_len,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        sampling_temperature=args.sampling_temperature,
        diverse_beam_groups=args.diverse_beam_groups,
        diverse_beam_strength=args.diverse_beam_strength,
        match_source_len=args.match_source_len,
        no_repeat_ngram_size=args.no_repeat_ngram_size,
    )

    if torch.cuda.is_available() and not args.cpu:
        translator.cuda()

    logger.info('model has been read successfully!')
    return models, task, tgt_dict, translator
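
A hedged usage sketch (assumptions: args comes from fairseq's interactive generation parser, as in the other examples here, and the paths are placeholders):

parser = options.get_generation_parser(interactive=True)
args = options.parse_args_and_arch(parser, input_args=[
    '/path/to/data-bin',                 # placeholder data directory
    '--path', '/path/to/checkpoint.pt',  # placeholder checkpoint
])
models, task, tgt_dict, translator = setup_model(args)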
Example #12
def main(args):
    assert args.path is not None, '--path required for evaluation!'

    args.tokens_per_sample = getattr(args, 'tokens_per_sample', 1024)
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset))))

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _ = utils.load_ensemble_for_inference(args.path.split(':'), task)

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()

    itr = data.EpochBatchIterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences or 4,
        max_positions=models[0].max_positions(),
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        ignore_invalid_inputs=True,
    ).next_epoch_itr(shuffle=False)

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(models, task.target_dictionary)
    if use_cuda:
        scorer.cuda()

    score_sum = 0.
    count = 0
    with progress_bar.build_progress_bar(args, itr) as t:
        results = scorer.score_batched_itr(t, cuda=use_cuda, timer=gen_timer)
        wps_meter = TimeMeter()
        for _, src_tokens, __, hypos in results:
            for hypo in hypos:
                pos_scores = hypo['positional_scores']
                inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(float('-inf'))
                if inf_scores.any():
                    print('| Skipping tokens with inf scores:',
                          task.target_dictionary.string(hypo['tokens'][inf_scores.nonzero()]))
                    pos_scores = pos_scores[(~inf_scores).nonzero()]
                score_sum += pos_scores.sum()
                count += pos_scores.numel()
            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count
    print('| Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Loss: {:.4f}, Perplexity: {:.2f}'.format(avg_nll_loss, np.exp(avg_nll_loss)))
Example #13
def main():
    parser = options.get_parser('Generation')
    parser.add_argument('--path', metavar='FILE', required=True, action='append',
                        help='path(s) to model file(s)')
    options.add_dataset_args(parser)
    options.add_generation_args(parser)

    args = parser.parse_args()
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    print('| loading model(s) from {}'.format(', '.join(args.path)))
    models, model_args = utils.load_ensemble_for_inference(args.path, data_dir=args.data)
    src_dict, dst_dict = models[0].src_dict, models[0].dst_dict

    print('| [{}] dictionary: {} types'.format(model_args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(model_args.target_lang, len(dst_dict)))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)

    # Initialize generator
    translator = SequenceGenerator(
        models, beam_size=args.beam, stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized), len_penalty=args.lenpen,
        unk_penalty=args.unkpen)
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print('| Type the input sentence and press return:')
    for src_str in sys.stdin:
        src_str = src_str.strip()
        src_tokens = tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
        if use_cuda:
            src_tokens = src_tokens.cuda()
        translations = translator.generate(Variable(src_tokens.view(1, -1)))
        hypos = translations[0]
        print('O\t{}'.format(src_str))

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu(),
                align_dict=align_dict,
                dst_dict=dst_dict,
                remove_bpe=args.remove_bpe)
            print('H\t{}\t{}'.format(hypo['score'], hypo_str))
            print('A\t{}'.format(' '.join(map(str, alignment))))
Example #14
def main(args):
    check_args(args)
    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 30000
    logger.info(args)

    # Load dataset splits
    task = tasks.setup_task(args)

    # Set dictionary
    tgt_dict = task.target_dictionary

    if args.ctc or args.rnnt:
        tgt_dict.add_symbol("<ctc_blank>")
        if args.ctc:
            logger.info("| decoding a ctc model")
        if args.rnnt:
            logger.info("| decoding a rnnt model")

    # Load ensemble
    logger.info("| loading model(s) from {}".format(args.path))
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(":"),
        task,
        model_arg_overrides=eval(args.model_overrides),  # noqa
    )
    optimize_models(args, models)

    # Initialize generator
    generator = task.build_generator(args)

    sp = spm.SentencePieceProcessor()
    sp.Load(os.path.join(args.data, 'spm.model'))

    # TODO: replace this
    # path = '/Users/jamarshon/Downloads/snippet.mp3'
    # path = '/Users/jamarshon/Downloads/hamlet.mp3'
    path = '/home/aakashns/speech_transcribe/deepspeech.pytorch/data/an4_dataset/train/an4/wav/cen8-mwhw-b.wav'
    if not os.path.exists(path):
        raise FileNotFoundError("Audio file not found: {}".format(path))
    waveform, sample_rate = torchaudio.load_wav(path)
    waveform = waveform.mean(0, True)
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate,
                                              new_freq=16000)(waveform)
    # waveform = waveform[:, :16000*30]
    # torchaudio.save('/Users/jamarshon/Downloads/hello.wav', waveform >> 16, 16000)
    import time
    print(sample_rate, waveform.shape)
    start = time.time()
    transcribe(waveform, args, task, generator, models, sp, tgt_dict)
    end = time.time()
    print(end - start)
Example #15
        def build_embedding(dictionary, embed_dim, path):

            assert path is not None
            args.short_seq_prob = 0.0
            task = BertTask(args, dictionary)
            models, _ = utils.load_ensemble_for_inference([path], task, {'remove_head': True, 'remove_pooled' : True, 'save_masks' : False})
            assert len(models) == 1, 'ensembles are currently not supported for BERT embeddings'
            pretrain_model = models[0]
            return pretrain_model
Example #16
    @classmethod
    def load_model(cls, path, cpu=False):
        args = argparse.Namespace(
            data=os.path.dirname(path), path=path, cpu=cpu,
            task='language_modeling', output_dictionary_size=-1,
            self_target=False, future_target=False, past_target=False)
        use_cuda = torch.cuda.is_available() and not cpu
        task = tasks.setup_task(args)
        models, _ = utils.load_ensemble_for_inference(args.path.split(':'), task)
        d = task.target_dictionary
        scorer = SequenceScorer(models, d)
        return cls(task, scorer, use_cuda)
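
Usage sketch (hedged): the wrapper class this classmethod belongs to is not shown in the snippet, so `LMScorer` below is a hypothetical name for it, and the checkpoint path is a placeholder.

lm = LMScorer.load_model('/path/to/checkpoint_best.pt', cpu=True)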
Example #17
    def load_model(self, args):
        # args = argparse.Namespace(data=data_path, path=model_path, cpu=cpu, task='edit')
        use_cuda = torch.cuda.is_available() and not args.cpu
        task = tasks.setup_task(args)
        logger.info('loading model from {}'.format(args.path))
        overrides = {'encoder_embed_path': None, 'decoder_embed_path': None}
        models, model_args = utils.load_ensemble_for_inference(
            args.path.split(':'), task, overrides)
        return task, models[0], model_args
Example #18
def main(args):
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    print('| loading model(s) from {}'.format(', '.join(args.path)))
    models, model_args = utils.load_ensemble_for_inference(args.path, data_dir=args.data)
    src_dict, dst_dict = models[0].src_dict, models[0].dst_dict

    print('| [{}] dictionary: {} types'.format(model_args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(model_args.target_lang, len(dst_dict)))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
        )

    # Initialize generator
    translator = SequenceGenerator(
        models, beam_size=args.beam, stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized), len_penalty=args.lenpen,
        unk_penalty=args.unkpen)
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print('| Type the input sentence and press return:')
    for src_str in sys.stdin:
        src_str = src_str.strip()
        src_tokens = tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
        if use_cuda:
            src_tokens = src_tokens.cuda()
        src_lengths = src_tokens.new([src_tokens.numel()])
        translations = translator.generate(
            Variable(src_tokens.view(1, -1)),
            Variable(src_lengths.view(-1)),
        )
        hypos = translations[0]
        print('O\t{}'.format(src_str))

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu(),
                align_dict=align_dict,
                dst_dict=dst_dict,
                remove_bpe=args.remove_bpe,
            )
            print('H\t{}\t{}'.format(hypo['score'], hypo_str))
            print('A\t{}'.format(' '.join(map(str, alignment))))
Example #19
def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, '--path required for generation!'

    print(args)

    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file,
    )
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file,
    )
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    models, model_args = utils.load_ensemble_for_inference(
        args.path,
        dataset.src_dict,
        dataset.dst_dict,
    )
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset(
        source_file=args.source_text_file,
        target_file=args.target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        append_eos=model_args.append_eos_to_source,
        reverse_source=model_args.reverse_source,
    )

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f'| [{dataset.src}] dictionary: {len(dataset.src_dict)} types')
    print(f'| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types')
    print(f'| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples')
    scorer, num_sentences, gen_timer = _generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset,
    )
    print(f'| Translated {num_sentences} sentences ({gen_timer.n} tokens) '
          f'in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)')
    print(f'| Generate {args.gen_subset} with beam={args.beam}: '
          f'{scorer.result_string()}')
    return scorer.score()
Example #20
def load_lm(lm_path, cpu=False):
    # TODO: don't hardcode path
    args = argparse.Namespace(data=lm_path, path=lm_path+'/wiki103.pt', cpu=cpu, task='language_modeling')
    use_cuda = torch.cuda.is_available() and not args.cpu
    task = tasks.setup_task(args)
    print('| loading model(s) from {}'.format(args.path))
    models, _ = utils.load_ensemble_for_inference(args.path.split(':'), task)
    d = task.target_dictionary
    scorer = SequenceScorer(models, d)
    if use_cuda:
        scorer.cuda()
    return task, scorer
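
Usage sketch (hedged): as the hardcoded path above implies, lm_path must be a directory containing both the fairseq-preprocessed data and a wiki103.pt checkpoint; the path below is a placeholder.

lm_task, lm_scorer = load_lm('/path/to/wiki103_dir', cpu=True)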
Example #21
def main(args):
    check_args(args)
    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 30000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)

    # Set dictionary
    tgt_dict = task.target_dictionary

    if args.ctc or args.rnnt:
        tgt_dict.add_symbol("<ctc_blank>")
        if args.ctc:
            logger.info("| decoding a ctc model")
        if args.rnnt:
            logger.info("| decoding a rnnt model")

    # Load ensemble
    logger.info("| loading model(s) from {}".format(args.path))
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(":"),
        task,
        model_arg_overrides=eval(args.model_overrides),  # noqa
    )
    optimize_models(args, use_cuda, models)

    # Initialize generator
    generator = task.build_generator(args)

    sp = spm.SentencePieceProcessor()
    sp.Load(os.path.join(args.data, 'spm.model'))

    path = args.input_file
    if not os.path.exists(path):
        raise FileNotFoundError("Audio file not found: {}".format(path))
    waveform, sample_rate = torchaudio.load_wav(path)
    waveform = waveform.mean(0, True)
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate,
                                              new_freq=16000)(waveform)
    import time
    print(sample_rate, waveform.shape)
    start = time.time()
    transcribe(waveform, args, task, generator, models, sp, tgt_dict)
    end = time.time()
    print(end - start)
Example #22
    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        trained_encoder, trained_decoder = None, None
        pretrained = eval(args.pretrained)
        if pretrained:
            print("| Loading pretrained model")
            trained_model = utils.load_ensemble_for_inference(
                # not actually for inference, but loads pretrained model parameters
                filenames=[args.pretrained_checkpoint],
                task=task,
            )[0][0]
            trained_decoder = list(trained_model.children())[1]
            trained_encoder = list(trained_model.children())[0]

            # freeze pretrained model
            for param in trained_decoder.parameters():
                param.requires_grad = False
            for param in trained_encoder.parameters():
                param.requires_grad = False

        encoder = FConvEncoder(
            task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            convolutions=eval(args.encoder_layers),
            dropout=args.dropout,
            max_positions=args.max_source_positions,
            attention=eval(args.encoder_attention),
            attention_nheads=args.encoder_attention_nheads,
            mask_future_timesteps=(not args.not_mask_encoder_future_timesteps)
        )

        decoder = FConvDecoder(
            task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            convolutions=eval(args.decoder_layers),
            out_embed_dim=args.decoder_out_embed_dim,
            attention=eval(args.decoder_attention),
            dropout=args.dropout,
            max_positions=args.max_target_positions,
            selfattention=eval(args.self_attention),
            attention_nheads=args.multihead_attention_nheads,
            selfattention_nheads=args.multihead_self_attention_nheads,
            project_input=eval(args.project_input),
            gated_attention=eval(args.gated_attention),
            downsample=eval(args.downsample),
            pretrained=pretrained,
            trained_decoder=trained_decoder
        )
        model = FConvModelSelfAtt(encoder, decoder, trained_encoder)

        return model
Example #23
def load_model_asr(args):
    # Setup task (needed below for loading the ensemble and building the generator)
    task = tasks.setup_task(args)

    # Load ensemble
    logger.info("| loading model(s) from {}".format(args.path))
    models_asr, _model_args = utils.load_ensemble_for_inference(
        args.path.split(":"), task, model_arg_overrides={})
    optimize_models_asr(args, models_asr)

    # Initialize generator
    generator = task.build_generator(args)

    sp = spm.SentencePieceProcessor()
    sp.Load(os.path.join(args.data, 'spm.model'))

    return models_asr, sp, generator
Example #24
    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        trained_encoder, trained_decoder = None, None
        pretrained = eval(args.pretrained)
        if pretrained:
            print("| loading pretrained model")
            trained_model = utils.load_ensemble_for_inference(
                # not actually for inference, but loads pretrained model parameters
                filenames=[args.pretrained_checkpoint],
                task=task,
            )[0][0]
            trained_decoder = list(trained_model.children())[1]
            trained_encoder = list(trained_model.children())[0]

            # freeze pretrained model
            for param in trained_decoder.parameters():
                param.requires_grad = False
            for param in trained_encoder.parameters():
                param.requires_grad = False

        encoder = FConvEncoder(
            task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            convolutions=eval(args.encoder_layers),
            dropout=args.dropout,
            max_positions=args.max_source_positions,
            attention=eval(args.encoder_attention),
            attention_nheads=args.encoder_attention_nheads
        )

        decoder = FConvDecoder(
            task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            convolutions=eval(args.decoder_layers),
            out_embed_dim=args.decoder_out_embed_dim,
            attention=eval(args.decoder_attention),
            dropout=args.dropout,
            max_positions=args.max_target_positions,
            selfattention=eval(args.self_attention),
            attention_nheads=args.multihead_attention_nheads,
            selfattention_nheads=args.multihead_self_attention_nheads,
            project_input=eval(args.project_input),
            gated_attention=eval(args.gated_attention),
            downsample=eval(args.downsample),
            pretrained=pretrained,
            trained_decoder=trained_decoder
        )
        model = FConvModelSelfAtt(encoder, decoder, trained_encoder)

        return model
Example #25
    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_architecture(args)

        dictionary = task.dictionary

        assert args.bert_path is not None
        args.short_seq_prob = 0.0
        task = BertTask(args, dictionary)
        models, _ = utils.load_ensemble_for_inference([args.bert_path], task, {
            'remove_head': True, 'remove_pooled' : True, 'save_masks' : False
        })
        assert len(models) == 1, 'ensembles are currently not supported for BERT embeddings'
        model = models[0]
        return FinetuningSquad(args, model)
Example #26
def calculate_bleu_on_subset(args, task, epoch_str: str, offset, dataset_split):
    # This is a trick to have generate use max_sentences_valid
    max_sentences_train = args.max_sentences
    args.max_sentences = args.max_sentences_valid
    datasets = []
    lang_pairs = []
    # In a multi-model training setup, evaluate one model at a time with its
    # corresponding dataset; lang_pair identifies the model used for generation.
    if isinstance(task, PytorchTranslateSemiSupervised) or isinstance(
        task, DualLearningTask
    ):
        for key, dataset in task.datasets[dataset_split].datasets.items():
            datasets.append(dataset)
            lang_pairs.append(key)
    else:
        datasets = [task.dataset(dataset_split)]
        lang_pairs = [None]
    score_aggregator_fn = (
        task.score_aggregator if hasattr(task, "score_aggregator") else sum
    )
    scores = []
    ensemble_models, _ = utils.load_ensemble_for_inference(args.path.split(":"), task)
    for dataset, lang_pair in zip(datasets, lang_pairs):
        # Generate score
        scorer, num_sentences, gen_timer, translation_samples = generate.generate_score(
            args=args,
            task=task,
            dataset=dataset,
            models=ensemble_models,
            lang_pair=lang_pair,
        )
        scores.append(scorer.score())
        print(
            f"| epoch {epoch_str} | offset {offset} "
            f"| Eval on {dataset_split} {lang_pair if lang_pair else ''} subset "
            f"with beam={args.beam}: {scorer.result_string()}. "
            f"Generated {num_sentences} sentences ({gen_timer.n} tokens) "
            f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s).",
            flush=True,
        )
    # Set max_sentences to its original value
    args.max_sentences = max_sentences_train
    return score_aggregator_fn(scores), translation_samples
Example #27
def main(args):
    """1 - Load Fairseq models in the conventional Fairseq way."""
    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    model_paths = args.path.split(':')
    models, _ = utils.load_ensemble_for_inference(model_paths,
                                                  task,
                                                  model_arg_overrides=eval(
                                                      args.model_overrides))

    # Set dictionaries
    src_dict = task.source_dictionary
    model = models[0]
    """2 - Added src code by am-i-compositional team."""
    emb_dict = {}
    distance_per_token = defaultdict(lambda: {"twin": 0, "other": []})
    for i, embedding in enumerate(model.encoder.embed_tokens.weights):
        emb_dict[src_dict.symbols[i]] = embedding

    # For every adapted function, compute distance to its twin,
    # and to all other functions
    for f in ["repeat", "remove_second", "swap_first_last", "append"]:
        functions = [
            "repeat", "swap_first_last", "reverse", "echo", "copy", "shift",
            "append", "prepend", "remove_first", "remove_second"
        ]
        functions.remove(f)

        for f2 in functions:
            distance = spatial.distance.cosine(emb_dict[f], emb_dict[f2])
            distance_per_token[f]["other"].append(distance)

        twin = f + "_twin"
        distance = spatial.distance.cosine(emb_dict[f], emb_dict[twin])
        distance_per_token[f]["twin"] = distance

    averaged = dict()
    for token in distance_per_token:
        distance_twin = distance_per_token[token]['twin']
        distance_other = np.mean(distance_per_token[token]['other'])
        print(f"{token} vs twin: {distance_twin:.3f}, " +
              f"{token} vs other: {distance_other:.3f}")
        averaged[token] = (distance_twin, distance_other)
    return averaged
Example #28
    def __init__(self, model_path, dict_path):
        parser = options.get_eval_lm_parser()
        parsed_args = options.parse_args_and_arch(parser,
                                                  input_args=[None],
                                                  parse_known=True)[0]
        parsed_args.path = model_path
        parsed_args.dict = dict_path
        parsed_args.max_sentences = 1
        parsed_args.gen_subset = 'test'
        parsed_args.raw_text = True
        parsed_args.no_progress_bar = True
        import_user_module(parsed_args)
        print(parsed_args)

        task = tasks.setup_task(parsed_args)
        print('| loading model(s) from {}'.format(parsed_args.path))
        models, args = utils.load_ensemble_for_inference(
            parsed_args.path.split(':'),
            task,
            model_arg_overrides=eval(parsed_args.model_overrides),
        )
        for arg in vars(parsed_args).keys():
            if arg not in {
                    'self_target', 'future_target', 'past_target',
                    'tokens_per_sample', 'output_size_dictionary'
            }:
                setattr(args, arg, getattr(parsed_args, arg))
        task = tasks.setup_task(args)

        self.use_cuda = torch.cuda.is_available() and not parsed_args.cpu
        for model in models:
            model.make_generation_fast_()
            if self.use_cuda:
                model.cuda()
        assert len(models) > 0

        scorer = SequenceScorer(task.target_dictionary)

        self.args = args
        self.task = task
        self.models = models
        self.scorer = scorer
Example #29
    def __init__(self, path, data, use_cpu=True):
        # Create the language modeling task.
        self.args = FluencyArgs(path, data)
        self.task = tasks.setup_task(self.args)
        self.use_cuda = torch.cuda.is_available() and not use_cpu

        # Load language model ensemble.
        models, model_args = utils.load_ensemble_for_inference(self.args.path.split(':'), self.task)
        self.models = models
        self.model_args = model_args

        # Optimize ensemble for generation.
        for model in self.models:
            model.make_generation_fast_()
            if self.use_cuda and self.model_args.fp16:
                model.half()

        # Create the sequence scorer.
        self.scorer = SequenceScorer(self.models, self.task.target_dictionary)
        if self.use_cuda:
            self.scorer.cuda()
Example #30
    def __init__(self, args, task):
        super().__init__(args, task)
        assert args.teacher_path, \
            "Please specify at least one valid file for --teacher-path"
        use_cuda = torch.cuda.is_available() and not self.args.cpu

        # Load model ensemble from checkpoints
        self.teacher_models, self.teacher_model_args = pytorch_translate_utils.load_ensemble_for_inference(
            args.teacher_path.split(":"), task)

        # Move models to device and to evaluation mode
        if use_cuda:
            for model in self.teacher_models:
                model.cuda()
        for model in self.teacher_models:
            model.make_generation_fast_(beamable_mm_beam_size=None)

        self.kd_weight = getattr(args, "kd_weight", 1)
        if self.kd_weight < 0 or self.kd_weight > 1:
            raise ValueError(
                f"--kd-weight ({self.kd_weight}) must be in [0, 1]")
Example #31
def load(model_names, use_cpu):
    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)
    args.encoder_embed_path = None  #'/data/wangzhe/SematicSeg/mlconvgec2018/models/embeddings/wiki_model.vec'
    args.beam = 12
    args.nbest = args.beam
    args.num_workers = 12
    args.data = '/data/wangzhe/SematicSeg/mlconvgec2018/models/data_bin'
    args.s = 'src'
    args.t = 'trg'
    task = tasks.setup_task(args)
    use_cuda = torch.cuda.is_available() and not use_cpu
    #print('| loading model(s) from {}'.format(args.path))
    models, _model_args = utils.load_ensemble_for_inference(
        model_names, task, model_arg_overrides=eval(args.model_overrides))
    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    generator = task.build_generator(args)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(), *[model.max_positions() for model in models])
    return models, generator, align_dict, max_positions, args, use_cuda, task, src_dict, tgt_dict
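
Usage sketch (hedged; the checkpoint path is a placeholder): load parses the remaining generation options from the command line itself, takes a list of checkpoint filenames (since load_ensemble_for_inference accepts an ensemble), and returns everything needed for decoding in one tuple.

(models, generator, align_dict, max_positions,
 args, use_cuda, task, src_dict, tgt_dict) = load(
    ['/path/to/checkpoint.pt'], use_cpu=True)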
Example #32
def main(args):
    check_args(args)
    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 30000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)

    # Set dictionary
    tgt_dict = task.target_dictionary

    # Load ensemble
    logger.info("| loading model(s) from {}".format(args.path))
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(":"),
        task,
        model_arg_overrides=eval(args.model_overrides),  # noqa
    )
    optimize_models(args, use_cuda, models)

    # Initialize generator
    generator = task.build_generator(args)

    sp = spm.SentencePieceProcessor()
    sp.Load(os.path.join(args.data, 'spm.model'))

    print("READY!")
    for (waveform, sample_rate) in get_microphone_chunks():
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate,
                                                  new_freq=16000)(
                                                      waveform.reshape(1, -1))
        transcription = transcribe(waveform, args, task, generator, models, sp,
                                   tgt_dict)
        print("{}: {}".format(dt.datetime.now().strftime('%H:%M:%S'),
                              transcription[0][0]))
Example #33
def model_fn(model_dir):
    
    model_name = 'checkpoint_best.pt'
    model_path = os.path.join(model_dir, model_name)

    logger.info('Loading the model')
    with open(model_path, 'rb') as f:
        model_info = torch.load(f, map_location=torch.device('cpu'))

    # Will be overridden by model_info['args'] - need to keep for pre-trained models
    parser = options.get_generation_parser(interactive=True)
    # get args for FairSeq by converting the hyperparameters as if they were command-line arguments
    argv_copy = copy.deepcopy(sys.argv)
    # temporarily replace the command-line arguments with the ones fairseq expects
    sys.argv[1:] = ['--path', model_path, model_dir]
    args = options.parse_args_and_arch(parser)
    # restore previous command-line args
    sys.argv = argv_copy
    
    saved_args = model_info['args']
    for key, value in vars(saved_args).items():
        setattr(args, key, value)

    args.data = [model_dir]
    print(args)

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info('Current device: {}'.format(device))

    model_paths = [os.path.join(model_dir, model_name)]
    models, model_args = utils.load_ensemble_for_inference(model_paths, task, model_arg_overrides={})

    # Set dictionaries
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Initialize generator
    translator = SequenceGenerator(
        models, tgt_dict, beam_size=args.beam, minlen=args.min_len,
        stop_early=(not args.no_early_stop), normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen, unk_penalty=args.unkpen,
        sampling=args.sampling, sampling_topk=args.sampling_topk, sampling_temperature=args.sampling_temperature,
        diverse_beam_groups=args.diverse_beam_groups, diverse_beam_strength=args.diverse_beam_strength,
    )

    if device.type == 'cuda':
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    # align_dict = utils.load_align_dict(args.replace_unk)
    align_dict = utils.load_align_dict(None)


    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    return dict(
        translator=translator,
        task=task,
        max_positions=max_positions,
        align_dict=align_dict,
        tgt_dict=tgt_dict,
        args=args,
        device=device,
    )
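
model_fn returns a dict bundling the translator, task, and dictionaries, following the model_fn(model_dir) convention used by model-serving containers (e.g. SageMaker-style PyTorch serving). A hedged local smoke test, assuming the placeholder directory contains checkpoint_best.pt plus the fairseq dictionaries:

ctx = model_fn('/path/to/model_dir')  # placeholder directory
print(ctx['device'], '| target dictionary: {} types'.format(len(ctx['tgt_dict'])))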
Example #34
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset))))

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _ = utils.load_ensemble_for_inference(args.path.split(':'), task, model_arg_overrides=eval(args.model_overrides))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    if args.score_reference:
        translator = SequenceScorer(models, task.target_dictionary)
    else:
        translator = SequenceGenerator(
            models, task.target_dictionary, beam_size=args.beam, minlen=args.min_len,
            stop_early=(not args.no_early_stop), normalize_scores=(not args.unnormalized),
            len_penalty=args.lenpen, unk_penalty=args.unkpen,
            sampling=args.sampling, sampling_topk=args.sampling_topk, sampling_temperature=args.sampling_temperature,
            diverse_beam_groups=args.diverse_beam_groups, diverse_beam_strength=args.diverse_beam_strength,
        )

    if use_cuda:
        translator.cuda()

    # Generate and compute BLEU score
    scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        if args.score_reference:
            translations = translator.score_batched_itr(t, cuda=use_cuda, timer=gen_timer)
        else:
            translations = translator.generate_batched_itr(
                t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
                cuda=use_cuda, timer=gen_timer, prefix_size=args.prefix_size,
            )

        wps_meter = TimeMeter()
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
            else:
                src_str = src_dict.string(src_tokens, args.remove_bpe)
                if has_target:
                    target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

            if not args.quiet:
                print('S-{}\t{}'.format(sample_id, src_str))
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                )

                if not args.quiet:
                    print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                    print('P-{}\t{}'.format(
                        sample_id,
                        ' '.join(map(
                            lambda x: '{:.4f}'.format(x),
                            hypo['positional_scores'].tolist(),
                        ))
                    ))

                    if args.print_alignment:
                        print('A-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(lambda x: str(utils.item(x)), alignment))
                        ))

                # Score only the top hypothesis
                if has_target and i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, tgt_dict, add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))
Example #35
def main(parsed_args):
    assert parsed_args.path is not None, '--path required for evaluation!'

    print(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    print('| loading model(s) from {}'.format(parsed_args.path))
    models, args = utils.load_ensemble_for_inference(parsed_args.path.split(':'), task)

    args.__dict__.update(parsed_args.__dict__)
    print(args)

    task.args = args

    # Load dataset splits
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset))))

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()

    assert len(models) > 0

    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens or 36000,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(*[
            model.max_positions() for model in models
        ]),
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        ignore_invalid_inputs=True,
    ).next_epoch_itr(shuffle=False)

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(models, task.target_dictionary)
    if use_cuda:
        scorer.cuda()

    score_sum = 0.
    count = 0

    if args.remove_bpe is not None:
        bpe_cont = args.remove_bpe.rstrip()
        bpe_toks = set(i for i in range(len(task.dictionary)) if task.dictionary[i].endswith(bpe_cont))
        bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    word_stats = dict()

    with progress_bar.build_progress_bar(args, itr) as t:
        results = scorer.score_batched_itr(t, cuda=use_cuda, timer=gen_timer)
        wps_meter = TimeMeter()
        for _, src_tokens, __, hypos in results:
            for hypo in hypos:
                pos_scores = hypo['positional_scores']

                skipped_toks = 0
                if bpe_toks is not None:
                    for i in range(len(hypo['tokens']) - 1):
                        if hypo['tokens'][i].item() in bpe_toks:
                            skipped_toks += 1
                            pos_scores[i + 1] += pos_scores[i]
                            pos_scores[i] = 0

                inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(float('-inf'))
                if inf_scores.any():
                    print('| Skipping tokens with inf scores:',
                          task.target_dictionary.string(hypo['tokens'][inf_scores.nonzero()]))
                    pos_scores = pos_scores[(~inf_scores).nonzero()]
                score_sum += utils.item(pos_scores.sum())
                count += pos_scores.numel() - skipped_toks

                if args.output_word_probs or args.output_word_stats:
                    w = ''
                    word_prob = []
                    is_bpe = False
                    for i in range(len(hypo['tokens'])):
                        w_ind = hypo['tokens'][i].item()
                        w += task.dictionary[w_ind]
                        if bpe_toks is not None and w_ind in bpe_toks:
                            w = w[:-bpe_len]
                            is_bpe = True
                        else:
                            word_prob.append((w, pos_scores[i].item()))
                            word_stats.setdefault(w, WordStat(w, is_bpe)).add(pos_scores[i].item())
                            is_bpe = False
                            w = ''
                    if args.output_word_probs:
                        print('\t'.join('{} [{:.2f}]'.format(x[0], x[1]) for x in word_prob))

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count
    print('| Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Loss: {:.4f}, Perplexity: {:.2f}'.format(avg_nll_loss, np.exp(avg_nll_loss)))

    if args.output_word_stats:
        for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True):
            print(ws)
Example #36
def main(args):
    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    model_paths = args.path.split(':')
    # --model-overrides is evaluated as a Python dict of argument overrides.
    models, model_args = utils.load_ensemble_for_inference(
        model_paths, task, model_arg_overrides=eval(args.model_overrides))

    # Set dictionaries
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Initialize generator
    translator = SequenceGenerator(
        models, tgt_dict, beam_size=args.beam, minlen=args.min_len,
        stop_early=(not args.no_early_stop), normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen, unk_penalty=args.unkpen,
        sampling=args.sampling, sampling_topk=args.sampling_topk, sampling_temperature=args.sampling_temperature,
        diverse_beam_groups=args.diverse_beam_groups, diverse_beam_strength=args.diverse_beam_strength,
    )

    if use_cuda:
        translator.cuda()

    # Load the alignment dictionary for unknown-word replacement
    # (None if --replace-unk is not set; an empty dict if it is set without a
    # dictionary path)
    align_dict = utils.load_align_dict(args.replace_unk)

    def make_result(src_str, hypos):
        result = Translation(
            src_str='O\t{}'.format(src_str),
            hypos=[],
            pos_scores=[],
            alignments=[],
        )

        # Process top predictions
        for hypo in hypos[:args.nbest]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                align_dict=align_dict,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe,
            )
            result.hypos.append('H\t{}\t{}'.format(hypo['score'], hypo_str))
            result.pos_scores.append('P\t{}'.format(
                ' '.join(map(
                    lambda x: '{:.4f}'.format(x),
                    hypo['positional_scores'].tolist(),
                ))
            ))
            result.alignments.append(
                'A\t{}'.format(' '.join(map(lambda x: str(utils.item(x)), alignment)))
                if args.print_alignment else None
            )
        return result

    def process_batch(batch):
        tokens = batch.tokens
        lengths = batch.lengths

        if use_cuda:
            tokens = tokens.cuda()
            lengths = lengths.cuda()

        translations = translator.generate(
            tokens,
            lengths,
            # Cap generation length at a linear function of the source length.
            maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b),
        )

        return [make_result(batch.srcs[i], t) for i, t in enumerate(translations)]

    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    if args.buffer_size > 1:
        print('| Sentence buffer size:', args.buffer_size)
    print('| Type the input sentence and press return:')
    for inputs in buffered_read(args.buffer_size):
        indices = []
        results = []
        for batch, batch_indices in make_batches(inputs, args, task, max_positions):
            indices.extend(batch_indices)
            results += process_batch(batch)

        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str)
            for hypo, pos_scores, align in zip(result.hypos, result.pos_scores, result.alignments):
                print(hypo)
                print(pos_scores)
                if align is not None:
                    print(align)
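
buffered_read and make_batches are helpers assumed to come from fairseq's interactive.py. A minimal stand-in for buffered_read under that assumption (a sketch, not the library's actual code) would read stdin in chunks:

    import sys

    def buffered_read(buffer_size):
        # Yield lists of up to `buffer_size` stripped lines from stdin, so
        # several inputs can be length-sorted and batched together.
        buffer = []
        for line in sys.stdin:
            buffer.append(line.strip())
            if len(buffer) >= buffer_size:
                yield buffer
                buffer = []
        if buffer:
            yield buffer

Because batching may reorder inputs by length, the loop above records batch_indices and uses np.argsort to print results in the original input order.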
Example #37
def main():
    parser = options.get_parser('Generation')
    parser.add_argument('--path', metavar='FILE', required=True, action='append',
                        help='path(s) to model file(s)')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT',
                              help='data subset to generate (train, valid, test)')
    options.add_generation_args(parser)

    args = parser.parse_args()
    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'none'
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset
    if args.replace_unk is None:
        dataset = data.load_dataset(args.data, [args.gen_subset], args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, [args.gen_subset], args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    # Load ensemble
    print('| loading model(s) from {}'.format(', '.join(args.path)))
    models, _ = utils.load_ensemble_for_inference(args.path, dataset.src_dict, dataset.dst_dict)

    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(dataset.splits[args.gen_subset])))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)

    # Initialize generator
    translator = SequenceGenerator(
        models, beam_size=args.beam, stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized), len_penalty=args.lenpen,
        unk_penalty=args.unkpen)
    if use_cuda:
        translator.cuda()

    # Load the alignment dictionary for unknown-word replacement
    # (None if --replace-unk is not set; an empty dict if it is set without a
    # dictionary path)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Generate and compute BLEU score
    scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(), dataset.dst_dict.unk())
    # Inputs must fit within the smallest encoder limit across the ensemble.
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        args.gen_subset, max_sentences=args.batch_size, max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test)
    num_sentences = 0
    with utils.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
            cuda_device=0 if use_cuda else None, timer=gen_timer)
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            target_tokens = target_tokens.int().cpu()
            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = dataset.splits[args.gen_subset].src.get_original_text(sample_id)
                target_str = dataset.splits[args.gen_subset].dst.get_original_text(sample_id)
            else:
                src_str = dataset.src_dict.string(src_tokens, args.remove_bpe)
                target_str = dataset.dst_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

            if not args.quiet:
                print('S-{}\t{}'.format(sample_id, src_str))
                print('T-{}\t{}'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:args.nbest]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu(),
                    align_dict=align_dict,
                    dst_dict=dataset.dst_dict,
                    remove_bpe=args.remove_bpe)

                if not args.quiet:
                    print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                    print('A-{}\t{}'.format(sample_id, ' '.join(map(str, alignment))))

                # Score only the top hypothesis
                if i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(target_str,
                                                                     dataset.dst_dict,
                                                                     add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))
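
The tokens-per-second figure is 1. / gen_timer.avg, which implies StopwatchMeter.avg measures seconds per generated token. A minimal stand-in consistent with that usage (an assumption inferred from this script, not fairseq's actual implementation):

    import time

    class StopwatchMeter:
        # start()/stop(n) accumulate elapsed wall time in `sum` and a unit
        # count in `n`; `avg` is then seconds per unit.
        def __init__(self):
            self.sum, self.n = 0.0, 0
            self._start = None

        def start(self):
            self._start = time.time()

        def stop(self, n=1):
            if self._start is not None:
                self.sum += time.time() - self._start
                self.n += n
                self._start = None

        @property
        def avg(self):
            return self.sum / self.n if self.n > 0 else 0.0

Passing timer=gen_timer into generate_batched_itr lets the generator time each batch and count the tokens it produced, so gen_timer.n ends up holding the total number of generated tokens.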