Example #1
def add_args(parser):
    """Add task-specific arguments to the parser."""
    LanguageModelingTask.add_args(parser)
    parser.add_argument(
        "--multiple-datasets",
        action="store_true",
        help="if set, treats paths in data as separate datasets to be combined, "
        "rather than as splits of a single dataset",
    )
    parser.add_argument(
        "--prepend-ds-name",
        action="store_true",
        help="if set and --multiple-datasets is also set, prepends the dataset "
        "name instead of the bos/eos token",
    )
    parser.add_argument(
        "--generic-ds-name-chance",
        type=float,
        metavar="P",
        default=0,
        help='if --multiple-datasets is set, sets the prepended dataset name '
        'to "generic" with probability P',
    )
    parser.add_argument(
        "--subsample-splits",
        type=str,
        metavar="SPLITS",
        default="valid",
        help="if --multiple-datasets is set, subsamples the specified split(s) "
        "(colon separated) to the size of the smallest split",
    )
Example #2
def add_args(parser):
     """Add task-specific arguments to the parser."""
     # fmt: off
     LanguageModelingTask.add_args(parser)
     parser.add_argument('--method',
                         default='hMoEup',
                         choices=['sMoElp', 'sMoEup', 'hMoElp', 'hMoEup'])
     parser.add_argument('--num-experts',
                         default=3,
                         type=int,
                         metavar='N',
                         help='number of experts')
     parser.add_argument('--mean-pool-gating-network',
                         action='store_true',
                         help='use a simple mean-pooling gating network')
     parser.add_argument('--mean-pool-gating-network-dropout',
                         type=float,
                         help='dropout for mean-pooling gating network')
     parser.add_argument(
         '--mean-pool-gating-network-encoder-dim',
         type=int,
         help='encoder output dim for mean-pooling gating network')
     parser.add_argument('--gen-expert',
                         type=int,
                         default=0,
                         help='which expert to use for generation')
Example #3
def add_args(parser):
     """Add task-specific arguments to the parser."""
     # fmt: off
     LanguageModelingTask.add_args(parser)
     parser.add_argument('--dict',
                         default=None,
                         type=str,
                         help='path to the dictionary')
Example #4
    def add_args(parser):
        parser.add_argument('--compute-metrics-interval', type=int, default=250,
                            help='compute custom metrics in the criterion once every `compute-metrics-interval` batches')
        parser.add_argument('--sequence-level-train-rate', type=float, default=0.0,
                            help='proportion of training steps to perform sequence level training')
        # - candidate_penalty
        parser.add_argument('--candidate-type', choices=['prev_context'],
                            default='prev_context')
        parser.add_argument('--rank-alpha', type=float)
        # - sequence
        parser.add_argument('--sequence-ngram-n', type=int, default=1)
        parser.add_argument('--sequence-prefix-length', type=int, default=16)
        parser.add_argument('--sequence-completion-length', type=int, default=48)
        parser.add_argument('--sequence-candidate-type', choices=['repeat', 'random'], default='repeat')
        parser.add_argument('--mask-p', type=float, default=0.5)

        # fmt: off
        LanguageModelingTask.add_args(parser)
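All four hooks share the same shape: they take an existing argparse parser, call LanguageModelingTask.add_args(parser) to register the base language-modeling options, and then add their own task-specific flags. A minimal, self-contained sketch of the pattern (plain argparse only; the base-class call is omitted and the flag is borrowed from Example #1 for illustration):

import argparse

def add_args(parser):
    # Register one task-specific flag, mirroring the examples above.
    parser.add_argument("--multiple-datasets", action="store_true",
                        help="treat paths in data as separate datasets")

parser = argparse.ArgumentParser()
add_args(parser)
args = parser.parse_args(["--multiple-datasets"])
assert args.multiple_datasets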
Example #5
    def test_eval_dataloader(self):
        dictionary = test_utils.dummy_dictionary(10)
        assert len(dictionary) == 14  # 4 extra special symbols
        assert dictionary.pad() == 1

        dataset = test_utils.TestDataset([
            torch.tensor([4, 5, 6, 7], dtype=torch.long),
            torch.tensor([8, 9, 10, 11], dtype=torch.long),
            torch.tensor([12, 13], dtype=torch.long),
        ])
        dataset = MonolingualDataset(dataset,
                                     sizes=[4, 4, 2],
                                     src_vocab=dictionary)

        config = LanguageModelingConfig(tokens_per_sample=4)
        task = LanguageModelingTask(config, dictionary)

        eval_dataloader = task.eval_lm_dataloader(
            dataset=dataset,
            batch_size=1,
            context_window=2,
            num_workers=0,
        )

        # First sentence fills the sample; remaining positions are pad (1).
        batch = next(eval_dataloader)
        assert batch["net_input"]["src_tokens"][0].tolist() == [
            4, 5, 6, 7, 1, 1
        ]
        assert batch["target"][0].tolist() == [4, 5, 6, 7, 1, 1]

        # The last two tokens of the previous sentence (6, 7) are carried over
        # as context: they appear in the input but are masked to pad in the
        # target so they are not scored again.
        batch = next(eval_dataloader)
        assert batch["net_input"]["src_tokens"][0].tolist() == [
            6, 7, 8, 9, 10, 11
        ]
        assert batch["target"][0].tolist() == [1, 1, 8, 9, 10, 11]

        # Same pattern for the final, shorter sentence: the context tokens
        # (10, 11) are masked in the target.
        batch = next(eval_dataloader)
        assert batch["net_input"]["src_tokens"][0].tolist() == [10, 11, 12, 13]
        assert batch["target"][0].tolist() == [1, 1, 12, 13]
Example #6
    def load_model(self, checkpoint: str, vocab_dir: str, **kwargs):
        # Load the fairseq checkpoint and extract the stored model config.
        checkpoint = torch.load(checkpoint)
        checkpoint_args = checkpoint["cfg"]["model"]

        # Rebuild the task and model as they were configured at training
        # time, then restore the trained weights.
        task_args = TaskArgs(
            vocab_dir,
            checkpoint_args.output_dictionary_size,
        )
        task = LanguageModelingTask.setup_task(task_args)
        model = FairseqPretrainedModel.build_model(checkpoint_args, task)
        model.load_state_dict(checkpoint["model"])

        # Attach a config describing the decoder's dimensions.
        config = FairseqConfig(
            model.decoder.embed_tokens.num_embeddings,
            model.decoder.output_embed_dim,
            model.decoder.num_layers,
        )
        model.config = config

        return model
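TaskArgs, FairseqPretrainedModel, and FairseqConfig are helpers defined elsewhere in this example's project, so the method is not runnable standalone. Assuming it lives on some wrapper object, a hypothetical call might look like:

# Hypothetical usage; the wrapper object and both paths are placeholders.
model = wrapper.load_model("checkpoint_best.pt", vocab_dir="data-bin/vocab")
model.eval()  # the returned model is an ordinary torch.nn.Module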
Example #7
    def __init__(self, args, tgt_dict):
        super().__init__(args, tgt_dict)

        self.unit_lm = args['unit_lm']

        self.lexicon = load_words(args['lexicon']) if args['lexicon'] else None
        self.idx_to_wrd = {}

        checkpoint = torch.load(args['lm_model'], map_location="cpu")

        if "cfg" in checkpoint and checkpoint["cfg"] is not None:
            lm_args = checkpoint["cfg"]
        else:
            lm_args = convert_namespace_to_omegaconf(checkpoint["args"])

        with open_dict(lm_args.task):
            lm_args.task.data = osp.dirname(args['lm_model'])

        task = LanguageModelingTask.setup_task(lm_args.task)
        model = task.build_model(lm_args.model)

        model.load_state_dict(checkpoint["model"], strict=False)

        self.trie = Trie(self.vocab_size, self.silence)

        self.word_dict = task.dictionary
        self.unk_word = self.word_dict.unk()

        self.lm = FairseqLM(self.word_dict, model)

        if self.lexicon:
            start_state = self.lm.start(False)
            for i, (word, spellings) in enumerate(self.lexicon.items()):

                if self.unit_lm:
                    word_idx = i
                    self.idx_to_wrd[i] = word
                    score = 0
                else:
                    word_idx = self.word_dict.index(word)
                    _, score = self.lm.score(start_state,
                                             word_idx,
                                             no_cache=True)

                for spelling in spellings:
                    spelling_idxs = [
                        tgt_dict.index(token) for token in spelling
                    ]

                    self.trie.insert(spelling_idxs, word_idx, score)
            self.trie.smear(SmearingMode.MAX)

            self.decoder_opts = LexiconDecoderOptions(
                beam_size=args['beam_size'],
                beam_size_token=args['beam_size_token'],
                beam_threshold=args['beam_threshold'],
                lm_weight=args['lm_weight'],
                word_score=args['word_score'],
                unk_score=args['unk_score'],
                sil_score=args['sil_score'],
                log_add=False,
                criterion_type=self.criterion_type,
            )

            self.decoder = LexiconDecoder(
                self.decoder_opts,
                self.trie,
                self.lm,
                self.silence,
                self.blank,
                self.unk_word,
                [],
                self.unit_lm,
            )
        else:
            assert args['unit_lm'], "lexicon free decoding can only be done with a unit language model"
            from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions

            d = {w: [[w]] for w in tgt_dict.symbols}
            self.word_dict = create_word_dict(d)
            self.lm = KenLM(args['lm_model'], self.word_dict)
            self.decoder_opts = LexiconFreeDecoderOptions(
                beam_size=args['beam_size'],
                beam_size_token=args['beam_size_token'],
                beam_threshold=args['beam_threshold'],
                lm_weight=args['lm_weight'],
                sil_score=args['sil_score'],
                log_add=False,
                criterion_type=self.criterion_type,
            )
            self.decoder = LexiconFreeDecoder(self.decoder_opts, self.lm,
                                              self.silence, self.blank, [])
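Both branches assume a lexicon shaped as a mapping from word to a list of spellings, each spelling a sequence of sub-word tokens; the lexicon-free branch builds a degenerate one-token-per-word version of the same structure. A toy example of the shape load_words is expected to return (contents here are hypothetical):

# Hypothetical lexicon: word -> list of spellings (token sequences).
lexicon = {
    "the": [["t", "h", "e"]],
    "hello": [["h", "e", "l", "l", "o"], ["h", "e", "l", "o"]],  # variants allowed
}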
Example #8
    def build_generator(self,
                        models,
                        args,
                        seq_gen_cls=None,
                        extra_gen_cls_kwargs=None):
        if getattr(args, "score_reference", False):
            raise NotImplementedError()
        else:
            from .noisy_channel_sequence_generator import NoisyChannelSequenceGenerator
            use_cuda = torch.cuda.is_available() and not self.args.cpu
            assert self.args.lm_model is not None, '--lm-model required for noisy channel generation!'
            assert self.args.lm_data is not None, '--lm-data required for noisy channel generation to map between LM and bitext vocabs'
            if self.args.channel_model is not None:
                import copy
                ch_args_task = copy.deepcopy(self.args)
                # The channel model translates in the reverse direction, so
                # swap source and target languages.
                ch_args_task.source_lang, ch_args_task.target_lang = (
                    ch_args_task.target_lang,
                    ch_args_task.source_lang,
                )
                ch_args_task._name = 'translation'
                channel_task = TranslationTask.setup_task(ch_args_task)

            # Set up a language-modeling task over the LM's own data
            # directory so that its dictionary can be loaded.
            lm_args = argparse.Namespace(
                task='language_modeling',
                sample_break_mode='eos',
                data=self.args.lm_data,
                output_dictionary_size=-1,
            )
            lm_task = LanguageModelingTask.setup_task(lm_args)
            lm_dict = lm_task.output_dictionary

            if self.args.channel_model is not None:
                channel_models, _ = checkpoint_utils.load_model_ensemble(
                    self.args.channel_model.split(':'), task=channel_task)

                for model in channel_models:
                    model.make_generation_fast_(
                        beamable_mm_beam_size=None
                        if args.no_beamable_mm else args.beam,
                        need_attn=args.print_alignment,
                    )
                    if self.args.fp16:
                        model.half()
                    if use_cuda:
                        model.cuda()
            else:
                channel_models = None

            lm_models, _ = checkpoint_utils.load_model_ensemble(
                self.args.lm_model.split(':'), task=lm_task)

            for model in lm_models:
                model.make_generation_fast_(
                    beamable_mm_beam_size=None
                    if args.no_beamable_mm else args.beam,
                    need_attn=args.print_alignment,
                )
                if self.args.fp16:
                    model.half()
                if use_cuda:
                    model.cuda()
            return NoisyChannelSequenceGenerator(
                combine_method=self.args.combine_method,
                tgt_dict=self.target_dictionary,
                src_dict=self.source_dictionary,
                beam_size=getattr(args, 'beam', 5),
                max_len_a=getattr(args, 'max_len_a', 0),
                max_len_b=getattr(args, 'max_len_b', 200),
                min_len=getattr(args, 'min_len', 1),
                len_penalty=getattr(args, 'lenpen', 1),
                unk_penalty=getattr(args, 'unkpen', 0),
                temperature=getattr(args, 'temperature', 1.),
                match_source_len=getattr(args, 'match_source_len', False),
                no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0),
                normalize_scores=(not getattr(args, 'unnormalized', False)),
                channel_models=channel_models,
                k2=getattr(self.args, 'k2', 50),
                ch_weight=getattr(self.args, 'ch_wt', 1),
                channel_scoring_type=self.args.channel_scoring_type,
                top_k_vocab=self.args.top_k_vocab,
                lm_models=lm_models,
                lm_dict=lm_dict,
                lm_weight=getattr(self.args, 'lm_wt', 1),
                normalize_lm_scores_by_tgt_len=getattr(
                    self.args, 'normalize_lm_scores_by_tgt_len', False),
            )