Code Example #1
    def __init__(
        self,
        vocab_path: Optional[str] = None,
        vocab_list: Optional[List[str]] = None,
        special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (
            vocab_path and vocab_list
        ), "vocab_path and vocab_list are mutual exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements
                )
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                    unk_token=vocab.unk_token,
                )
Code Example #2
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
     self.normalizer = tensorizers["dense"].normalizer
     self.model = traced_model
     self.output_layer = output_layer
     self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
Code Example #3
File: transforms.py Project: terrorizer1980/pytext
    def __init__(
        self, vocab_path: Optional[str] = None, vocab_list: Optional[List[str]] = None
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (
            vocab_path and vocab_list
        ), "vocab_path and vocab_list are mutual exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                special_token_replacements = {
                    "[UNK]": UNK,
                    "[PAD]": PAD,
                    "[CLS]": BOS,
                    "[MASK]": MASK,
                    "[SEP]": EOS,
                }
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements
                )
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                )
Code Example #4
    def __init__(
        self,
        vocab_path: Optional[str] = None,
        vocab_list: Optional[List[str]] = None,
        special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
        add_bos: bool = False,
        add_eos: bool = False,
        max_seq_len: int = 2**30,
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (vocab_path and vocab_list
                    ), "vocab_path and vocab_list are mutual exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements)
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                    unk_token=vocab.unk_token,
                )
        # TODO T77728853 We need to combine truncate with BOS/EOS as they impact each other
        # Need to find a nicer way to do this, as this can't be chained.
        self.add_bos = add_bos
        self.add_eos = add_eos
        # Make room for bos and eos from max_seq_len if true
        self.truncate_transform = TruncateTransform(max_seq_len - add_bos -
                                                    add_eos)
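A quick worked example of the budget computation on the last line above (a hypothetical configuration, not taken from the source): because Python booleans subtract as integers, enabling both flags shrinks the truncation budget by two.

max_seq_len = 256
add_bos, add_eos = True, True
truncate_budget = max_seq_len - add_bos - add_eos  # 256 - 1 - 1 = 254 tokens of real input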
Code Example #5
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(input_vocab,
                                   unk_idx=input_vocab.idx[UNK])
     self.model = traced_model
     self.output_layer = output_layer
     self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
     self.max_seq_len = jit.Attribute(max_seq_len, int)
Code Example #6
 def __init__(self, vocab: Vocabulary):
     super().__init__()
     self.vocab = ScriptVocabulary(
         list(vocab),
         pad_idx=vocab.get_pad_index(-1),
         bos_idx=vocab.get_bos_index(-1),
         eos_idx=vocab.get_eos_index(-1),
         unk_idx=vocab.get_unk_index(-1),
     )
Code Example #7
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
     self.max_byte_len = jit.Attribute(max_byte_len, int)
     self.byte_offset_for_non_padding = jit.Attribute(
         byte_offset_for_non_padding, int
     )
     self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
     self.model = traced_model
     self.output_layer = output_layer
Code Example #8
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(
         input_vocab,
         input_vocab.get_unk_index(),
         input_vocab.get_pad_index(),
     )
     self.model = traced_model
     self.output_layer = output_layer
     self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
     self.max_seq_len = jit.Attribute(max_seq_len, int)
Code Example #9
 def _run_benchmark_pytext_script_vocab(toks, v: PytextScriptVocabulary):
     # list lookup
     if isinstance(toks, list) and isinstance(toks[0], list):
         for tokens_list in toks:
             v.lookup_indices_1d(tokens_list)
     # single token lookup
     elif isinstance(toks, list):
         for token in toks:
             v.lookup_indices_1d([token])
     else:
         raise RuntimeError("Received tokens of incorrect type {}.".format(
             type(toks)))
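A hypothetical way to drive the benchmark helper above, assuming PytextScriptVocabulary is pytext's ScriptVocabulary (as it is aliased in the torchtext benchmark scripts); the token lists are placeholders.

from pytext.torchscript.vocab import ScriptVocabulary

v = ScriptVocabulary(["hello", "world"])
# Nested lists take the per-sentence lookup_indices_1d branch ...
_run_benchmark_pytext_script_vocab([["hello", "world"], ["world"]], v)
# ... while a flat list is looked up one token at a time.
_run_benchmark_pytext_script_vocab(["hello", "world"], v)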
Code Example #10
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(
         input_vocab,
         input_vocab.get_unk_index(),
         input_vocab.get_pad_index(),
     )
     self.normalizer = tensorizers["dense"].normalizer
     self.model = traced_model
     self.output_layer = output_layer
     self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
     self.max_seq_len = jit.Attribute(max_seq_len, int)
     self.tokenizer = scripted_tokenizer
Code Example #11
File: doc_model.py Project: xiaoanshi/pytext
    def __init__(
        self,
        pretrained_embeddings_path: str,
        embedding_dim: int,
        mlp_layer_dims: Optional[Sequence[int]] = None,
        lowercase_tokens: bool = False,
        skip_header: bool = True,
        delimiter: str = " ",
        vocab: ScriptVocabulary = None,
    ) -> None:
        super().__init__()
        vocab = vocab or build_vocab(pretrained_embeddings_path)
        pretrained_embedding = PretrainedEmbedding(
            pretrained_embeddings_path,
            lowercase_tokens=lowercase_tokens,
            skip_header=skip_header,
            delimiter=delimiter,
        )
        embeddings_weight = pretrained_embedding.initialize_embeddings_weights(
            vocab.idx,  # tensorizer.vocab.idx,
            vocab.unk_token,  # tensorizer.vocab.unk_token,
            embedding_dim,
            EmbedInitStrategy.RANDOM,
        )
        num_embeddings = len(vocab.idx)

        self.embedding = nn.Embedding(
            num_embeddings,
            embedding_dim,
            _weight=embeddings_weight,
            padding_idx=vocab.get_pad_index(),
        )

        # Initialize unk embedding with zeros
        # to guard the model against randomized decisions based on unknown words
        unk_token_idx = vocab.get_unk_index()
        if unk_token_idx >= 0:
            self.embedding.weight.data[unk_token_idx].fill_(0.0)

        # Create MLP layers
        if mlp_layer_dims is None:
            mlp_layer_dims = []

        self.mlp = nn.Sequential(
            *(
                nn.Sequential(nn.Linear(m, n), nn.ReLU())
                for m, n in zip([embedding_dim] + list(mlp_layer_dims), mlp_layer_dims)
            )
        )
        self.output_dim = mlp_layer_dims[-1] if mlp_layer_dims else embedding_dim
Code Example #12
File: doc_model.py Project: morganzwest/pytext
 def __init__(self):
     super().__init__()
     self.vocab = ScriptVocabulary(
         input_vocab,
         input_vocab.get_unk_index(),
         input_vocab.get_pad_index(),
     )
     self.normalizer = tensorizers["dense"].normalizer
     self.max_seq_len = jit.Attribute(max_seq_len, int)
     self.max_byte_len = jit.Attribute(max_byte_len, int)
     self.byte_offset_for_non_padding = jit.Attribute(
         byte_offset_for_non_padding, int)
     self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
     self.model = traced_model
     self.output_layer = output_layer
Code Example #13
        class ModelWithDenseFeat(jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = ScriptVocabulary(input_vocab, unk_idx=input_vocab.idx[UNK])
                self.normalizer = tensorizers["dense"].normalizer
                self.max_byte_len = jit.Attribute(max_byte_len, int)
                self.byte_offset_for_non_padding = jit.Attribute(
                    byte_offset_for_non_padding, int
                )
                self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
                self.model = traced_model
                self.output_layer = output_layer

            @jit.script_method
            def forward(self, tokens: List[List[str]], dense_feat: List[List[float]]):
                seq_lens = make_sequence_lengths(tokens)
                word_ids = self.vocab.lookup_indices_2d(tokens)
                word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
                token_bytes, _ = make_byte_inputs(
                    tokens, self.max_byte_len, self.byte_offset_for_non_padding
                )
                dense_feat = self.normalizer.normalize(dense_feat)
                logits = self.model(
                    torch.tensor(word_ids),
                    token_bytes,
                    torch.tensor(seq_lens),
                    torch.tensor(dense_feat, dtype=torch.float),
                )
                return self.output_layer(logits)
Code Example #14
    def __init__(
        self,
        add_bos_token: bool,
        add_eos_token: bool,
        use_eos_token_for_bos: bool,
        max_seq_len: int,
        vocab: Vocabulary,
        tokenizer: Optional[Tokenizer],
    ):
        super().__init__()

        if tokenizer is not None and hasattr(tokenizer, "torchscriptify"):
            try:
                self.tokenizer = tokenizer.torchscriptify()
            except NotImplementedError:
                # This is fine as long as the exported tokenizer is only used
                # in pre-tokenized mode
                self.tokenizer = None
        else:
            self.tokenizer = None

        self.do_nothing_tokenizer = ScriptDoNothingTokenizer()
        self.vocab = ScriptVocabulary(
            list(vocab),
            pad_idx=vocab.get_pad_index(),
            bos_idx=vocab.get_bos_index() if add_bos_token else -1,
            eos_idx=vocab.get_eos_index() if add_eos_token else -1,
        )
        self.vocab_lookup_1d = VocabLookup(self.vocab)

        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_eos_token_for_bos = use_eos_token_for_bos
        self.max_seq_len = max_seq_len
Code Example #15
class VocabTransform(nn.Module):
    def __init__(
        self,
        vocab_path: Optional[str] = None,
        vocab_list: Optional[List[str]] = None,
        special_token_replacements=SPECIAL_TOKEN_REPLACEMENT,
    ):
        super().__init__()
        assert vocab_path or vocab_list, "vocab_path or vocab_list is required"
        assert not (
            vocab_path and vocab_list
        ), "vocab_path and vocab_list are mutual exclusive"

        if vocab_list:
            self.vocab = ScriptVocabulary(vocab_list)
        else:
            with PathManager.open(vocab_path) as f:
                vocab = build_fairseq_vocab(
                    f, special_token_replacements=special_token_replacements
                )
                self.vocab = ScriptVocabulary(
                    list(vocab),
                    pad_idx=vocab.get_pad_index(-1),
                    bos_idx=vocab.get_bos_index(-1),
                    eos_idx=vocab.get_eos_index(-1),
                    unk_idx=vocab.get_unk_index(-1),
                    unk_token=vocab.unk_token,
                )

    def forward(self, tokens: List[List[str]]) -> List[List[int]]:
        return self.vocab.lookup_indices_2d(tokens)
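A minimal usage sketch for the VocabTransform above; the vocabulary and the token lists are illustrative placeholders.

transform = VocabTransform(vocab_list=["the", "cat", "sat"])
# Each token is mapped to its index in the vocabulary; tokens not in the
# list fall back to the vocabulary's UNK index.
token_ids = transform([["the", "cat"], ["sat"]])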
Code Example #16
        class ModelWithDenseFeat(jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = ScriptVocabulary(input_vocab,
                                              unk_idx=input_vocab.idx[UNK])
                self.normalizer = tensorizers["dense"].normalizer
                self.model = traced_model
                self.output_layer = output_layer
                self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)

            @jit.script_method
            def forward(
                self,
                texts: Optional[List[str]] = None,
                tokens: Optional[List[List[str]]] = None,
                languages: Optional[List[str]] = None,
                dense_feat: Optional[List[List[float]]] = None,
            ):
                if tokens is None:
                    raise RuntimeError("tokens is required")

                seq_lens = make_sequence_lengths(tokens)
                word_ids = self.vocab.lookup_indices_2d(tokens)
                word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
                if dense_feat is not None:
                    dense_feat = self.normalizer.normalize(dense_feat)
                else:
                    raise RuntimeError("dense is required")
                logits = self.model(
                    torch.tensor(word_ids),
                    torch.tensor(seq_lens),
                    torch.tensor(dense_feat, dtype=torch.float),
                )
                return self.output_layer(logits)
Code Example #17
        class Model(jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = ScriptVocabulary(input_vocab,
                                              unk_idx=input_vocab.idx[UNK])
                self.model = traced_model
                self.output_layer = output_layer
                self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
                self.max_seq_len = jit.Attribute(max_seq_len, int)

            @jit.script_method
            def forward(
                self,
                texts: Optional[List[str]] = None,
                tokens: Optional[List[List[str]]] = None,
                languages: Optional[List[str]] = None,
            ):
                if tokens is None:
                    raise RuntimeError("tokens is required")

                trimmed_tokens: List[List[str]] = []
                if self.max_seq_len >= 0:
                    for token in tokens:
                        trimmed_tokens.append(token[0:self.max_seq_len])
                else:
                    trimmed_tokens = tokens

                seq_lens = make_sequence_lengths(trimmed_tokens)
                word_ids = self.vocab.lookup_indices_2d(trimmed_tokens)
                word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
                logits = self.model(torch.tensor(word_ids),
                                    torch.tensor(seq_lens))
                return self.output_layer(logits)
Code Example #18
File: test_tensorizer.py Project: nadileaf/pytext
    def test_xlm_token_tensorizer(self):
        vocab = self._mock_vocab()

        xlm = ScriptXLMTensorizer(
            tokenizer=ScriptDoNothingTokenizer(),
            token_vocab=vocab,
            language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
            max_seq_len=256,
            default_language="en",
        )
        rand_tokens = [
            [str(random.randint(100, 200)) for i in range(20)],
            [str(random.randint(100, 200)) for i in range(10)],
        ]

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens))
        tokens = tokens.tolist()
        # eos token
        self.assertEqual(tokens[0][0], 202)
        self.assertEqual(tokens[0][-1], 202)
        # pad token
        self.assertEqual(tokens[1][12:], [200] * 10)

        languages = languages.tolist()
        self.assertEqual(languages[0], [2] * len(tokens[0]))
        self.assertEqual(languages[1][12:], [0] * 10)

        tokens, pad_masks, languages, positions = xlm.tensorize(
            tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"]))
        languages = languages.tolist()
        self.assertEqual(languages[0][:], [1] * len(tokens[0]))
        self.assertEqual(languages[1][:12], [2] * 12)
Code Example #19
        class Model(jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = ScriptVocabulary(input_vocab,
                                              unk_idx=input_vocab.idx[UNK])
                self.max_byte_len = jit.Attribute(max_byte_len, int)
                self.byte_offset_for_non_padding = jit.Attribute(
                    byte_offset_for_non_padding, int)
                self.pad_idx = jit.Attribute(input_vocab.idx[PAD], int)
                self.model = traced_model
                self.output_layer = output_layer

            @jit.script_method
            def forward(
                self,
                texts: Optional[List[str]] = None,
                tokens: Optional[List[List[str]]] = None,
                languages: Optional[List[str]] = None,
            ):
                if tokens is None:
                    raise RuntimeError("tokens is required")
                seq_lens = make_sequence_lengths(tokens)
                word_ids = self.vocab.lookup_indices_2d(tokens)
                word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
                token_bytes, _ = make_byte_inputs(
                    tokens, self.max_byte_len,
                    self.byte_offset_for_non_padding)
                logits = self.model(torch.tensor(word_ids), token_bytes,
                                    torch.tensor(seq_lens))
                return self.output_layer(logits)
Code Example #20
        class Model(jit.ScriptModule):
            def __init__(self):
                super().__init__()
                self.vocab = ScriptVocabulary(
                    input_vocab,
                    input_vocab.get_unk_index(),
                    input_vocab.get_pad_index(),
                )
                self.model = traced_model
                self.output_layer = output_layer
                self.pad_idx = jit.Attribute(input_vocab.get_pad_index(), int)
                self.max_seq_len = jit.Attribute(max_seq_len, int)

            @jit.script_method
            def forward(
                self,
                texts: Optional[List[str]] = None,
                multi_texts: Optional[List[List[str]]] = None,
                tokens: Optional[List[List[str]]] = None,
                languages: Optional[List[str]] = None,
            ):
                if tokens is None:
                    raise RuntimeError("tokens is required")

                tokens = truncate_tokens(tokens, self.max_seq_len, self.vocab.pad_token)
                seq_lens = make_sequence_lengths(tokens)
                word_ids = self.vocab.lookup_indices_2d(tokens)
                word_ids = pad_2d(word_ids, seq_lens, self.pad_idx)
                logits = self.model(torch.tensor(word_ids), torch.tensor(seq_lens))
                return self.output_layer(logits)
Code Example #21
File: test_module.py Project: yinghai/pytext
 def _mock_vocab(self):
     # tokens are the strings "0"-"9", so each token's index equals its integer value
     return ScriptVocabulary(
         [str(i) for i in range(0, 10)],
         pad_idx=-1,
         bos_idx=0,
         unk_idx=-1,
     )
Code Example #22
 def _mock_xlm_tensorizer(self, max_seq_len=256):
     return ScriptXLMTensorizer(
         tokenizer=ScriptDoNothingTokenizer(),
         token_vocab=self._mock_vocab(),
         language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
         max_seq_len=max_seq_len,
         default_language="en",
     )
Code Example #23
class LabelTransform(nn.Module):
    def __init__(self, label_names: List[str]):
        super().__init__()

        self.label_vocab = ScriptVocabulary(sorted(label_names))

    def forward(self, labels: List[str]) -> List[int]:
        return self.label_vocab.lookup_indices_1d(labels)
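A short hypothetical example of the LabelTransform above; because the label names are sorted, the indices follow alphabetical order.

label_transform = LabelTransform(["positive", "negative", "neutral"])
# The sorted vocabulary is ["negative", "neutral", "positive"], so the
# expected result here is [0, 2].
label_ids = label_transform(["negative", "positive"])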
Code Example #24
    def torchscriptify(self):
        languages = [0] * (max(list(self.lang2id.values())) + 1)
        for k, v in self.lang2id.items():
            languages[v] = k

        return ScriptXLMTensorizer(
            tokenizer=self.tokenizer.torchscriptify(),
            token_vocab=ScriptVocabulary(
                list(self.vocab),
                pad_idx=self.vocab.get_pad_index(),
                bos_idx=self.vocab.get_eos_index(),  # BOS is deliberately the EOS index: XLM marks both sentence boundaries with the same token
                eos_idx=self.vocab.get_eos_index(),
                unk_idx=self.vocab.get_unk_index(),
            ),
            language_vocab=ScriptVocabulary(languages),
            max_seq_len=self.max_seq_len,
            default_language=self.default_language,
        )
Code Example #25
 def torchscriptify(self):
     return ScriptBERTTensorizer(
         tokenizer=self.tokenizer.torchscriptify(),
         vocab=ScriptVocabulary(
             list(self.vocab),
             pad_idx=self.vocab.get_pad_index(),
             bos_idx=self.vocab.get_bos_index(),
             eos_idx=self.vocab.get_eos_index(),
         ),
         max_seq_len=self.max_seq_len,
     )
Code Example #26
File: xlm_tensorizer.py Project: twwhatever/pytext
 def __init__(
     self,
     tokenizer: Tokenizer,
     vocab: Vocabulary,
     max_seq_len: int,
     language_vocab: List[str],
     default_language: str,
 ):
     super().__init__(tokenizer, vocab, max_seq_len)
     self.language_vocab = ScriptVocabulary(language_vocab)
     self.default_language = torch.jit.Attribute(default_language, str)
Code Example #27
File: pipelines.py Project: whitemike889/text
def build_pytext_vocab_pipeline(vocab_file):
    tokenizer = BasicEnglishNormalize()
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    # Insert token in vocab to match a pretrained vocab
    pipeline = TextDataPipeline(tokenizer,
                                PyTextVocabTransform(ScriptVocabulary(vocab_list)))
    jit_pipeline = torch.jit.script(pipeline)
    print('jit PyText pipeline success!')
    return pipeline, jit_pipeline
Code Example #28
File: bert_tensorizer.py Project: nadileaf/pytext
 def __init__(self, tokenizer: Tokenizer, vocab: Vocabulary, max_seq_len: int):
     super().__init__()
     self.tokenizer = tokenizer
     self.vocab = ScriptVocabulary(
         list(vocab),
         pad_idx=vocab.get_pad_index(),
         bos_idx=vocab.get_bos_index(-1),
         eos_idx=vocab.get_eos_index(-1),
         unk_idx=vocab.get_unk_index(),
     )
     self.vocab_lookup = VocabLookup(self.vocab)
     self.max_seq_len = max_seq_len
Code Example #29
def build_pytext_vocab_pipeline(vocab_file):
    from pytext.torchscript.vocab import ScriptVocabulary
    tokenizer = basic_english_normalize()
    with open(vocab_file, 'r') as f:
        vocab_list = [line.rstrip() for line in f]

    # Insert token in vocab to match a pretrained vocab
    pipeline = TextSequentialTransforms(
        tokenizer, PyTextVocabTransform(ScriptVocabulary(vocab_list)),
        ToLongTensor())
    jit_pipeline = torch.jit.script(pipeline.to_ivalue())
    print('jit PyText pipeline success!')
    return pipeline, pipeline.to_ivalue(), jit_pipeline
Code Example #30
 def torchscriptify(self):
     return ScriptRoBERTaTensorizer(
         tokenizer=self.tokenizer.torchscriptify(),
         vocab=ScriptVocabulary(
             list(self.vocab),
             pad_idx=self.vocab.get_pad_index(),
             bos_idx=self.vocab.get_bos_index(),
             eos_idx=self.vocab.get_eos_index(),
         ),
         max_seq_len=self.max_seq_len,
         add_bos_token=True,
         use_eos_token_for_bos=False,
     )