Example #1
    def __init__(
        self,
        vocab_path: Optional[str] = None,
        vocab_list: Optional[List[str]] = None,
        special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (
            vocab_path and vocab_list
        ), "vocab_path and vocab_list are mutual exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements
                )
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                    unk_token=vocab.unk_token,
                )
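As a point of reference, here is a minimal standalone sketch (not from the original module) of exercising a ScriptVocabulary built from an in-memory list; the import path follows Examples #20 and #23, the lookup_indices_1d call mirrors Example #28, and the token list is hypothetical.

from pytext.torchscript.vocab import ScriptVocabulary

# Hypothetical token list; with the assumed default unk_idx of 0,
# index 0 doubles as the fallback slot for unseen tokens.
vocab = ScriptVocabulary(["<unk>", "the", "cat", "sat"])

print(vocab.lookup_indices_1d(["the", "sat", "zebra"]))  # expected: [1, 3, 0]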
Example #2
    def __init__(
        self,
        vocab_path: Optional[str] = None,
        vocab_list: Optional[List[str]] = None,
        special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
        add_bos: bool = False,
        add_eos: bool = False,
        max_seq_len: int = 2**30,
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (
            vocab_path and vocab_list
        ), "vocab_path and vocab_list are mutually exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements)
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                    unk_token=vocab.unk_token,
                )
        # TODO T77728853 We need to combine truncate with BOS/EOS as they impact each other
        # Need to find a nicer way to do this, as this can't be chained.
        self.add_bos = add_bos
        self.add_eos = add_eos
        # Reserve room within max_seq_len for BOS/EOS when they are enabled
        self.truncate_transform = TruncateTransform(max_seq_len - add_bos - add_eos)
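A quick standalone check of the length arithmetic above (not part of the transform): add_bos and add_eos are bools, so subtracting them from max_seq_len reserves exactly one position per enabled special token.

max_seq_len, add_bos, add_eos = 256, True, False

# True subtracts as 1 and False as 0, so only the BOS slot is reserved here.
effective_len = max_seq_len - add_bos - add_eos
assert effective_len == 255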
Example #3
    def __init__(
        self, vocab_path: Optional[str] = None, vocab_list: Optional[List[str]] = None
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (
            vocab_path and vocab_list
        ), "vocab_path and vocab_list are mutual exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                special_token_replacements = {
                    "[UNK]": UNK,
                    "[PAD]": PAD,
                    "[CLS]": BOS,
                    "[MASK]": MASK,
                    "[SEP]": EOS,
                }
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements
                )
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                )
Example #4
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
     self.normalizer = tensorizers["dense"].normalizer
     self.model = traced_model
     self.output_layer = output_layer
     self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
Example #5
    def test_xlm_token_tensorizer(self):
        vocab = self._mock_vocab()

        xlm = ScriptXLMTensorizer(
            tokenizer=ScriptDoNothingTokenizer(),
            token_vocab=vocab,
            language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
            max_seq_len=256,
            default_language="en",
        )
        rand_tokens = [
            [str(random.randint(100, 200)) for i in range(20)],
            [str(random.randint(100, 200)) for i in range(10)],
        ]

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))
        tokens = tokens.tolist()
        # eos token
        self.assertEqual(tokens[0][0], 202)
        self.assertEqual(tokens[0][-1], 202)
        # pad token
        self.assertEqual(tokens[1][12:], [200] * 10)

        languages = languages.tolist()
        self.assertEqual(languages[0], [2] * len(tokens[0]))
        self.assertEqual(languages[1][12:], [0] * 10)

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"]))
        languages = languages.tolist()
        self.assertEqual(languages[0][:], [1] * len(tokens[0]))
        self.assertEqual(languages[1][:12], [2] * 12)
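The language ids asserted in this test come straight from the order of the language vocab; a standalone sketch using only the lookup call shown in Example #28:

from pytext.torchscript.vocab import ScriptVocabulary

lang_vocab = ScriptVocabulary(["ar", "cn", "en"])

# "en" sits at index 2 and "cn" at index 1, matching the assertions above.
print(lang_vocab.lookup_indices_1d(["en", "cn", "ar"]))  # expected: [2, 1, 0]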
Example #6
    def __init__(
        self,
        add_bos_token: bool,
        add_eos_token: bool,
        use_eos_token_for_bos: bool,
        max_seq_len: int,
        vocab: Vocabulary,
        tokenizer: Optional[Tokenizer],
    ):
        super().__init__()

        if tokenizer is not None and hasattr(tokenizer, "torchscriptify"):
            try:
                self.tokenizer = tokenizer.torchscriptify()
            except NotImplementedError:
                # This is fine as long as the exported tokenizer is only used
                # in pre-tokenized mode
                self.tokenizer = None
        else:
            self.tokenizer = None

        self.do_nothing_tokenizer = ScriptDoNothingTokenizer()
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
            bos_idx=vocab.get_bos_index() if add_bos_token else -1,
            eos_idx=vocab.get_eos_index() if add_eos_token else -1,
        )
        self.vocab_lookup_1d = VocabLookup(self.vocab)

        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.max_seq_len = max_seq_len
Example #7
 def _mock_vocab(self):
     # mapping of vocab index to token is 0-9
     return ScriptVocabulary(
         [str(i) for i in range(0, 10)],
         pad_idx=-1,
         bos_idx=0,
         unk_idx=-1,
     )
 def _mock_xlm_tensorizer(self, max_seq_len=256):
     return ScriptXLMTensorizer(
         tokenizer=ScriptDoNothingTokenizer(),
         token_vocab=self._mock_vocab(),
         language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
         max_seq_len=max_seq_len,
         default_language="en",
     )
Example #9
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(input_vocab,
                                   unk_idx=input_vocab.idx[UNK])
     self.model = traced_model
     self.output_layer = output_layer
     self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
     self.max_seq_len = jit.Attribute(max_seq_len, int)
Example #10
    def torchscriptify(self):
        languages = [0] * (max(list(self.lang2id.values())) + 1)
        for k, v in self.lang2id.items():
            languages[v] = k

        return ScriptXLMTensorizer(
            tokenizer=self.tokenizer.torchscriptify(),
            token_vocab=ScriptVocabulary(
                list(self.vocab),
                pad_idx=self.vocab.get_pad_index(),
                bos_idx=self.vocab.get_eos_index(),
                eos_idx=self.vocab.get_eos_index(),
                unk_idx=self.vocab.get_unk_index(),
            ),
            language_vocab=ScriptVocabulary(languages),
            max_seq_len=self.max_seq_len,
            default_language=self.default_language,
        )
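The first few lines of this torchscriptify invert lang2id into a positional list of language names; a tiny standalone sketch with a hypothetical mapping:

lang2id = {"ar": 0, "cn": 1, "en": 2}  # hypothetical language-to-id mapping

languages = [0] * (max(list(lang2id.values())) + 1)
for k, v in lang2id.items():
    languages[v] = k

assert languages == ["ar", "cn", "en"]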
Example #11
 def __init__(self, vocab: Vocabulary):
     super().__init__()
     self.vocab = ScriptVocabulary(
         list(vocab),
         pad_idx=vocab.get_pad_index(-1),
         bos_idx=vocab.get_bos_index(-1),
         eos_idx=vocab.get_eos_index(-1),
         unk_idx=vocab.get_unk_index(-1),
     )
Example #12
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
     self.max_byte_len = jit.Attribute(max_byte_len, int)
     self.byte_offset_for_non_padding = jit.Attribute(
         byte_offset_for_non_padding, int
     )
     self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
     self.model = traced_model
     self.output_layer = output_layer
Example #13
 def __init__(
     self,
     tokenizer: Tokenizer,
     vocab: Vocabulary,
     max_seq_len: int,
     language_vocab: List[str],
     default_language: str,
 ):
     super().__init__(tokenizer, vocab, max_seq_len)
     self.language_vocab = ScriptVocabulary(language_vocab)
     self.default_language = torch.jit.Attribute(default_language, str)
Example #14
 def torchscriptify(self):
     return ScriptBERTTensorizer(
         tokenizer=self.tokenizer.torchscriptify(),
         vocab=ScriptVocabulary(
             list(self.vocab),
             pad_idx=self.vocab.get_pad_index(),
             bos_idx=self.vocab.get_bos_index(),
             eos_idx=self.vocab.get_eos_index(),
         ),
         max_seq_len=self.max_seq_len,
     )
Example #15
def build_pytext_vocab_pipeline(vocab_file):
    tokenizer = BasicEnglishNormalize()
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    # Insert token in vocab to match a pretrained vocab
    pipeline = TextDataPipeline(tokenizer,
                                PyTextVocabTransform(ScriptVocabulary(vocab_list)))
    jit_pipeline = torch.jit.script(pipeline)
    print('jit PyText pipeline success!')
    return pipeline, jit_pipeline
Example #16
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(
         input_vocab,
         input_vocab.get_unk_index(),
         input_vocab.get_pad_index(),
     )
     self.model = traced_model
     self.output_layer = output_layer
     self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
     self.max_seq_len = jit.Attribute(max_seq_len, int)
Example #17
 def __init__(self, tokenizer: Tokenizer, vocab: Vocabulary, max_seq_len: int):
     super().__init__()
     self.tokenizer = tokenizer
     self.vocab = ScriptVocabulary(
         list(vocab),
         pad_idx=vocab.get_pad_index(),
         bos_idx=vocab.get_bos_index(-1),
         eos_idx=vocab.get_eos_index(-1),
         unk_idx=vocab.get_unk_index(),
     )
     self.vocab_lookup = VocabLookup(self.vocab)
     self.max_seq_len = max_seq_len
Example #18
 def torchscriptify(self):
     return ScriptRoBERTaTensorizer(
         tokenizer=self.tokenizer.torchscriptify(),
         vocab=ScriptVocabulary(
             list(self.vocab),
             pad_idx=self.vocab.get_pad_index(),
             bos_idx=self.vocab.get_bos_index(),
             eos_idx=self.vocab.get_eos_index(),
         ),
         max_seq_len=self.max_seq_len,
         add_bos_token=True,
         use_eos_token_for_bos=False,
     )
Example #19
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(
         input_vocab,
         input_vocab.get_unk_index(),
         input_vocab.get_pad_index(),
     )
     self.normalizer = tensorizers["dense"].normalizer
     self.model = traced_model
     self.output_layer = output_layer
     self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
     self.max_seq_len = jit.Attribute(max_seq_len, int)
     self.tokenizer = scripted_tokenizer
Example #20
def build_pytext_vocab_pipeline(vocab_file):
    from pytext.torchscript.vocab import ScriptVocabulary
    tokenizer = basic_english_normalize()
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    # Insert token in vocab to match a pretrained vocab
    pipeline = TextSequentialTransforms(
        tokenizer, PyTextVocabTransform(ScriptVocabulary(vocab_list)),
        ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Example #21
    def __init__(
        self,
        src_dict,
        tgt_dict,
        sequence_generator,
        filter_eos_bos,
        copy_unk_token=False,
        dictfeat_dict=None,
    ):
        super().__init__()
        self.source_vocab = ScriptVocabulary(
            src_dict._vocab,
            src_dict.get_unk_index(),
            bos_idx=src_dict.get_bos_index(-1),
            eos_idx=src_dict.get_eos_index(-1),
        )
        self.target_vocab = ScriptVocabulary(
            tgt_dict._vocab,
            tgt_dict.get_unk_index(),
            bos_idx=tgt_dict.get_bos_index(),
            eos_idx=tgt_dict.get_eos_index(),
        )
        if dictfeat_dict:
            self.dictfeat_vocab = ScriptVocabulary(
                dictfeat_dict._vocab,
                # We want to use the index for the source pad token
                pad_idx=dictfeat_dict.idx[src_dict[src_dict.get_pad_index()]],
            )
        else:
            # Optional types in Torchscript are a bit of a pain, so it's
            # more convenient to have an empty model than use None in
            # this case.
            self.dictfeat_vocab = ScriptVocabulary([])
        self.sequence_generator = sequence_generator

        self.copy_unk_token: bool = copy_unk_token
        self.unk_idx: int = self.source_vocab.unk_idx
        self.filter_eos_bos: bool = filter_eos_bos
Example #22
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(
         input_vocab,
         input_vocab.get_unk_index(),
         input_vocab.get_pad_index(),
     )
     self.normalizer = tensorizers["dense"].normalizer
     self.max_seq_len = jit.Attribute(max_seq_len, int)
     self.max_byte_len = jit.Attribute(max_byte_len, int)
     self.byte_offset_for_non_padding = jit.Attribute(
         byte_offset_for_non_padding, int)
     self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
     self.model = traced_model
     self.output_layer = output_layer
Example #23
def build_legacy_pytext_script_vocab_pipeline(vocab_file):
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()
    with open(vocab_file, 'r') as f:
        vocab_counter = Counter([token for line in f for token in line.rstrip()])
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")

    pipeline = TextSequentialTransforms(tokenizer_func(tokenizer),
                                        PyTextScriptVocabTransform(ScriptVocabulary(vocab_list)))
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit legacy PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
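A standalone walkthrough of the frequency-sorted vocab construction above, using hypothetical file contents; note that iterating over line.rstrip() yields individual characters, so this variant builds a character-level vocab.

from collections import Counter

lines = ["abca", "bcb"]  # hypothetical file contents

vocab_counter = Counter([token for line in lines for token in line.rstrip()])
sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
vocab_list.insert(0, "<unk>")

assert vocab_list == ["<unk>", "b", "a", "c"]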
Example #24
 def generator_setup(self, trg_vocab, beam_size, use_gold_length,
                     beam_ranking_algorithm, config):
     self.trg_vocab = ScriptVocabulary(
         list(trg_vocab),
         pad_idx=trg_vocab.get_pad_index(),
         bos_idx=trg_vocab.get_bos_index(-1),
         eos_idx=trg_vocab.get_eos_index(-1),
         mask_idx=trg_vocab.get_mask_index(),
     )
     self.length_beam_size = beam_size
     self.use_gold_length = use_gold_length
     self.beam_ranking_algorithm = get_beam_ranking_function(
         ranking_algorithm=beam_ranking_algorithm)
     self.clip_target_length = config.clip_target_length
     self.targetlen_cap = config.targetlen_cap
     self.targetlen_a = config.targetlen_a
     self.targetlen_b = config.targetlen_b
     self.targetlen_c = config.targetlen_c
Example #25
    def test_xlm_tensorizer_seq_padding_size_exceeds_max_seq_len(self):
        vocab = self._mock_vocab()

        xlm = ScriptXLMTensorizer(
            tokenizer=ScriptDoNothingTokenizer(),
            token_vocab=vocab,
            language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
            max_seq_len=20,
            default_language="en",
        )

        seq_padding_control = [0, 32, 256]
        xlm.set_padding_control("sequence_length", seq_padding_control)

        rand_tokens = [
            [str(random.randint(100, 200)) for i in range(30)],
            [str(random.randint(100, 200)) for i in range(20)],
            [str(random.randint(100, 200)) for i in range(10)],
        ]

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))

        token_count = [len(t) + 2 for t in rand_tokens]
        expected_batch_size = len(rand_tokens)
        expected_token_size = min(
            max(max(token_count), seq_padding_control[1]), xlm.max_seq_len)
        expected_padding_count = [
            max(0, expected_token_size - cnt) for cnt in token_count
        ]
        token_count = [
            expected_token_size - cnt for cnt in expected_padding_count
        ]

        # verify tensorized tokens padding
        tokens = tokens.tolist()
        self.assertEqual(len(tokens), expected_batch_size)
        # every row should be padded to exactly expected_token_size
        self.assertEqual(max(len(t) for t in tokens), expected_token_size)
        self.assertEqual(min(len(t) for t in tokens), expected_token_size)
        for i in range(expected_batch_size):
            self.assertEqual(tokens[i][token_count[i]:],
                             [200] * expected_padding_count[i])

        # verify tensorized languages
        languages = languages.tolist()
        self.assertEqual(len(languages), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(languages[i][:token_count[i]],
                             [2] * token_count[i])
            self.assertEqual(languages[i][token_count[i]:],
                             [0] * expected_padding_count[i])

        # verify tensorized positions
        positions = positions.tolist()
        self.assertEqual(len(positions), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(positions[i][token_count[i]:],
                             [0] * expected_padding_count[i])

        # verify pad_masks
        pad_masks = pad_masks.tolist()
        self.assertEqual(len(pad_masks), expected_batch_size)
        for i in range(expected_batch_size):
            self.assertEqual(pad_masks[i][:token_count[i]],
                             [1] * token_count[i])
            self.assertEqual(pad_masks[i][token_count[i]:],
                             [0] * expected_padding_count[i])
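A numeric walkthrough of the padding-size arithmetic in this test (standalone, assuming the three random token lists keep their generated lengths of 30, 20 and 10):

max_seq_len = 20
seq_padding_control = [0, 32, 256]

# +2 accounts for the EOS token added at both ends of each sequence.
token_count = [30 + 2, 20 + 2, 10 + 2]  # [32, 22, 12]

expected_token_size = min(max(max(token_count), seq_padding_control[1]), max_seq_len)
assert expected_token_size == 20  # capped by max_seq_len

expected_padding_count = [max(0, expected_token_size - cnt) for cnt in token_count]
assert expected_padding_count == [0, 0, 8]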
Example #26
    def __init__(
        self,
        config,
        model,
        length_prediction_model,
        trg_vocab,
        beam_size,
        use_gold_length,
        beam_ranking_algorithm,
        quantize,
        embed_quantize,
    ):
        super().__init__()
        length_prediction_model = length_prediction_model.create_eval_module()
        if quantize:
            self.model = torch.quantization.quantize_dynamic(
                model,
                {
                    torch.nn.Linear:
                    torch.quantization.per_channel_dynamic_qconfig
                },
                dtype=torch.qint8,
                inplace=False,
            )
            # embedding quantization
            if embed_quantize != EmbedQuantizeType.NONE:

                # 8-bit embedding quantization
                if embed_quantize == EmbedQuantizeType.BIT_8:
                    ## identify nn.Embedding
                    for module in self.model.modules():
                        if isinstance(module, torch.nn.Embedding):
                            module.qconfig = float_qparams_weight_only_qconfig

                    prepare(self.model, inplace=True)
                    convert(self.model, inplace=True)

                # 4-bit embedding quantization
                elif embed_quantize == EmbedQuantizeType.BIT_4:
                    raise NotImplementedError(
                        "4bit embedding quantization not yet supported")
                else:
                    raise NotImplementedError(
                        "Embedding Quantization should be either 8bit or 4bit")

            self.length_prediction_model = torch.quantization.quantize_dynamic(
                length_prediction_model,
                {
                    torch.nn.Linear:
                    torch.quantization.per_channel_dynamic_qconfig
                },
                dtype=torch.qint8,
                inplace=False,
            )
        else:
            self.model = model
            self.length_prediction_model = length_prediction_model

        self.trg_vocab = ScriptVocabulary(
            list(trg_vocab),
            pad_idx=trg_vocab.get_pad_index(),
            bos_idx=trg_vocab.get_bos_index(-1),
            eos_idx=trg_vocab.get_eos_index(-1),
            mask_idx=trg_vocab.get_mask_index(),
        )
        self.length_beam_size = beam_size
        self.use_gold_length = use_gold_length
        self.beam_ranking_algorithm = get_beam_ranking_function(
            ranking_algorithm=beam_ranking_algorithm)
        self.clip_target_length = config.clip_target_length
        self.targetlen_cap = config.targetlen_cap
        self.targetlen_a = config.targetlen_a
        self.targetlen_b = config.targetlen_b
        self.targetlen_c = config.targetlen_c
Example #27
 def setUp(self):
     vocab_list = ["UNK", "a", "b", "c", "d"]
     self.vocab = ScriptVocabulary(vocab_list)
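A sketch of a test that could follow this setUp, assuming ScriptVocabulary's default unk_idx of 0 so that out-of-vocabulary tokens resolve to the "UNK" entry (compare Example #28, which overrides unk_idx explicitly):

def test_lookup_with_default_unk(self):
    # "a" and "b" are in-vocab; "zzz" is assumed to fall back to index 0 ("UNK").
    self.assertEqual([1, 2, 0], self.vocab.lookup_indices_1d(["a", "b", "zzz"]))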
Example #28
 def test_custom_unk(self):
     vocab_list = ["a", "UNK", "b", "c", "d"]
     vocab = ScriptVocabulary(vocab_list, unk_idx=1)
     self.assertEqual([0, 1, 3, 4], vocab.lookup_indices_1d(["a", "e", "c", "d"]))
Example #29
    def __init__(self, label_names: List[str]):
        super().__init__()

        self.label_vocab = ScriptVocabulary(sorted(label_names))
Example #30
 def _mock_vocab(self):
     # mapping of vocab index to token is x: x + 100
     return ScriptVocabulary([str(i) for i in range(100, 203)],
                             pad_idx=200,
                             bos_idx=201,
                             eos_idx=202)
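The indices asserted in Examples #5 and #25 (pad 200, eos 202, values offset by 100) follow directly from this mock vocab's layout; a standalone sketch of the mapping, reusing only calls shown above:

vocab = ScriptVocabulary([str(i) for i in range(100, 203)],
                         pad_idx=200, bos_idx=201, eos_idx=202)

# Token "100" sits at index 0, so each token string maps to int(token) - 100.
assert vocab.lookup_indices_1d(["100", "150", "199"]) == [0, 50, 99]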