Example #1
    def _get_whole_word_mask(self):
        # build a per-vocabulary-entry mask marking which symbols begin a word
        if self.args.mask_whole_words:
            bpe = encoders.build_bpe(self.args)
            if bpe is not None:

                def is_beginning_of_word(i):
                    if i < self.source_dictionary.nspecial:
                        # special elements are always considered beginnings
                        return True
                    tok = self.source_dictionary[i]
                    if tok.startswith("madeupword"):
                        return True
                    try:
                        return bpe.is_beginning_of_word(tok)
                    except ValueError:
                        return True

                mask_whole_words = torch.ByteTensor(
                    list(
                        map(is_beginning_of_word,
                            range(len(self.source_dictionary)))))
        else:
            mask_whole_words = None
        return mask_whole_words
Example #2
    def __init__(self, cfg, task, models):
        super().__init__()
        self.cfg = cfg
        self.task = task
        self.models = nn.ModuleList(models)
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary

        # optimize model for generation
        for model in self.models:
            model.prepare_for_inference_(cfg)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(cfg.generation.replace_unk)

        self.tokenizer = encoders.build_tokenizer(cfg.tokenizer)
        self.bpe = encoders.build_bpe(cfg.bpe)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(), *[model.max_positions() for model in models]
        )

        # this is useful for determining the device
        self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float))
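
Both encoders built above come from fairseq's registry, and either can be None when the corresponding option is unset. Below is a minimal sketch of exercising the BPE side on its own; it assumes the "gpt2" BPE type (whose default vocabulary files fairseq downloads on first use), and the encode/decode calls are the wrapper's string-to-string methods, not anything shown in the snippet:

    import argparse

    from fairseq.data import encoders

    # assumption: "gpt2" is the registered BPE name; its defaults fetch encoder.json/vocab.bpe
    bpe = encoders.build_bpe(argparse.Namespace(bpe="gpt2"))

    raw = "Hello world!"
    bpe_tokens = bpe.encode(raw)       # raw text -> space-separated BPE token ids
    restored = bpe.decode(bpe_tokens)  # inverse mapping back to the raw text
    assert restored == raw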
Example #3
    def __init__(self, cfg, task, model):
        super().__init__()
        self.cfg = cfg
        self.task = task
        self.model = model

        self.bpe = encoders.build_bpe(cfg.bpe)

        # this is useful for determining the device
        self.register_buffer("_float_tensor",
                             torch.tensor([0], dtype=torch.float))
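
The _float_tensor buffer registered in both interfaces above is a general PyTorch idiom rather than anything BPE-specific: buffers move together with the module on .to() / .cuda(), so reading the buffer's device later tells you where the wrapped model currently lives. A minimal self-contained sketch of the idiom (the class name and the device property are illustrative, not taken from the snippets):

    import torch
    import torch.nn as nn

    class DeviceAware(nn.Module):  # hypothetical minimal example of the idiom
        def __init__(self):
            super().__init__()
            # buffers follow the module when it is moved between devices
            self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float))

        @property
        def device(self):
            return self._float_tensor.device

    m = DeviceAware()
    print(m.device)  # cpu; becomes cuda:0 after m.cuda() if a GPU is available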
Example #4
def get_whole_word_mask(args, dictionary):
    bpe = encoders.build_bpe(args)
    if bpe is not None:

        def is_beginning_of_word(i):
            if i < dictionary.nspecial:
                # special elements are always considered beginnings
                return True
            tok = dictionary[i]
            if tok.startswith("madeupword"):
                return True
            try:
                return bpe.is_beginning_of_word(tok)
            except ValueError:
                return True

        mask_whole_words = torch.ByteTensor(
            list(map(is_beginning_of_word, range(len(dictionary)))))
        return mask_whole_words
    return None
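
The returned ByteTensor has one entry per dictionary index, set to 1 where that symbol begins a word and 0 where it continues one, so a per-token whole-word mask for a binarized sentence is obtained by indexing it with the token ids. A hedged usage sketch, assuming args and dictionary have already been set up by a fairseq task (the indices 10-12 are arbitrary stand-ins):

    import torch

    mask_whole_words = get_whole_word_mask(args, dictionary)
    if mask_whole_words is not None:
        # stand-in for a binarized sentence: a LongTensor of dictionary indices
        sample = torch.LongTensor([dictionary.bos(), 10, 11, 12, dictionary.eos()])
        word_begins = mask_whole_words[sample]  # 1 marks tokens that start a new word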
Example #5
    def build_bpe(self, args):
        """Build the BPE tokenizer for this task."""
        return encoders.build_bpe(args)
Example #6
    def __init__(self, bpe, **kwargs):
        super().__init__()
        # forward BPE-specific options straight into the Namespace read by build_bpe
        args = argparse.Namespace(bpe=bpe, **kwargs)
        self.bpe = encoders.build_bpe(args)
        assert self.bpe is not None
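
Because the constructor forwards its keyword arguments into the Namespace read by encoders.build_bpe, BPE-specific options can be passed as plain kwargs. In the hedged sketch below, BPEWrapper is a placeholder for whatever class this __init__ belongs to, and sentencepiece_model is assumed to be the option fairseq's sentencepiece wrapper expects:

    # hypothetical usage; BPEWrapper stands in for the class shown above
    wrapper = BPEWrapper(
        bpe="sentencepiece",
        sentencepiece_model="/path/to/spm.model",  # placeholder path
    )
    pieces = wrapper.bpe.encode("Hello world")  # space-separated sentencepiece pieces
    print(wrapper.bpe.decode(pieces))           # back to "Hello world"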
Example #7
    def build_bpe(self, args):
        logger.info(f"tokenizer: {self.data_cfg.bpe_tokenizer}")
        return encoders.build_bpe(Namespace(**self.data_cfg.bpe_tokenizer))
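
Here the BPE options come from the dataset configuration rather than from command-line arguments: data_cfg.bpe_tokenizer is a plain mapping whose keys become Namespace attributes. A hedged sketch of what such a mapping might contain for a sentencepiece model (the key names and model filename are illustrative assumptions, not taken from the snippet):

    from argparse import Namespace

    from fairseq.data import encoders

    # illustrative stand-in for self.data_cfg.bpe_tokenizer, normally parsed
    # from the dataset's config file
    bpe_tokenizer = {
        "bpe": "sentencepiece",
        "sentencepiece_model": "spm_unigram10000.model",
    }
    bpe = encoders.build_bpe(Namespace(**bpe_tokenizer))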