Code Example #1
 def from_config(cls, config: Config, **kwargs):
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     replacements = {
         config.unk_token: UNK,
         config.pad_token: PAD,
         config.bos_token: BOS,
         config.eos_token: EOS,
         config.mask_token: MASK,
     }
     if isinstance(tokenizer, WordPieceTokenizer):
         vocab = Vocabulary(
             [token for token, _ in tokenizer.vocab.items()],
             replacements=replacements,
         )
     else:
         dictionary = BertDictionary.load(config.vocab_file)
         vocab = Vocabulary(
             dictionary.symbols, dictionary.count, replacements=replacements
         )
     return cls(
         columns=config.columns,
         tokenizer=tokenizer,
         add_bos_token=config.add_bos_token,
         add_eos_token=config.add_eos_token,
         use_eos_token_for_bos=config.use_eos_token_for_bos,
         max_seq_len=config.max_seq_len,
         vocab=vocab,
         **kwargs,
     )
Code Example #2
File: output_layer_test.py Project: marskong/pytext
    def test_torchscript_intent_slot_output_layer(self, num_doc_labels,
                                                  num_word_labels, seq_lens):
        batch_size = len(seq_lens)
        doc_vocab = Vocabulary([
            OutputLayerTest._generate_random_string()
            for _ in range(num_doc_labels)
        ])
        word_vocab = Vocabulary([
            OutputLayerTest._generate_random_string()
            for _ in range(num_word_labels)
        ])
        intent_slot_output_layer = IntentSlotOutputLayer.from_config(
            config=IntentSlotOutputLayer.Config(),
            doc_labels=doc_vocab,
            word_labels=word_vocab,
        )
        doc_logits = OutputLayerTest._generate_doc_classification_inputs(
            batch_size, num_doc_labels)
        word_logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
            batch_size, num_word_labels, seq_lens)
        context = {"seq_lens": seq_lens_tensor}
        torchscript_output_layer = intent_slot_output_layer.torchscript_predictions()

        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_logits), None, context)[1]
        ts_output = torchscript_output_layer((doc_logits, word_logits),
                                             seq_lens_tensor)

        self._validate_doc_classification_result(pt_output[0], ts_output[0],
                                                 doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1],
                                           word_vocab)

        (
            word_bpe_logits,
            seq_lens_tensor,
            token_indices_tensor,
        ) = OutputLayerTest._generate_bpe_tagging_inputs(
            batch_size, num_word_labels, seq_lens)
        context = {
            "seq_lens": seq_lens_tensor,
            "token_indices": token_indices_tensor
        }
        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_bpe_logits), None, context)[1]
        ts_output = torchscript_output_layer((doc_logits, word_bpe_logits),
                                             seq_lens_tensor,
                                             token_indices_tensor)

        self._validate_doc_classification_result(pt_output[0], ts_output[0],
                                                 doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1],
                                           word_vocab)
Code Example #3
File: output_layer_test.py Project: a-domingu/tbcnn
    def test_torchscript_intent_slot_output_layer(
        self, num_doc_labels, num_word_labels, seq_lens
    ):
        batch_size = len(seq_lens)
        doc_vocab = Vocabulary(
            [OutputLayerTest._generate_random_string() for _ in range(num_doc_labels)]
        )
        word_vocab = Vocabulary(
            [OutputLayerTest._generate_random_string() for _ in range(num_word_labels)]
        )
        intent_slot_output_layer = IntentSlotOutputLayer.from_config(
            config=IntentSlotOutputLayer.Config(),
            doc_labels=doc_vocab,
            word_labels=word_vocab,
        )
        doc_logits = OutputLayerTest._generate_doc_classification_inputs(
            batch_size, num_doc_labels
        )
        word_logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
            batch_size, num_word_labels, seq_lens
        )
        context = {"seq_lens": seq_lens_tensor}
        torchscript_output_layer = intent_slot_output_layer.torchscript_predictions()

        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_logits), None, context
        )[1]
        with redirect_stdout() as redirected_stdout:
            ts_output = torchscript_output_layer((doc_logits, word_logits), context)
            buffer = redirected_stdout.getvalue()
            assert (
                "Implicit dimension choice for log_softmax has been deprecated"
                not in buffer
            )

        self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)

        (
            word_bpe_logits,
            seq_lens_tensor,
            token_indices_tensor,
        ) = OutputLayerTest._generate_bpe_tagging_inputs(
            batch_size, num_word_labels, seq_lens
        )
        context = {"seq_lens": seq_lens_tensor, "token_indices": token_indices_tensor}
        pt_output = intent_slot_output_layer.get_pred(
            (doc_logits, word_bpe_logits), None, context
        )[1]
        ts_output = torchscript_output_layer((doc_logits, word_bpe_logits), context)

        self._validate_doc_classification_result(pt_output[0], ts_output[0], doc_vocab)
        self._validate_word_tagging_result(pt_output[1], ts_output[1], word_vocab)
Code Example #4
    def test_wordblstm_export_to_caffe2(self, export_num_words,
                                        num_word_classes, test_num_words,
                                        num_predictions):
        for WORD_CONFIG in WORD_CONFIGS:
            config = self._get_config(WordTaggingTask.Config, WORD_CONFIG)
            tensorizers, data = _NewTask._init_tensorizers(config)
            word_labels = [
                SpecialTokens.PAD, SpecialTokens.UNK, "NoLabel", "person"
            ]
            tensorizers["labels"].vocab = Vocabulary(word_labels)
            tensorizers["tokens"].vocab = Vocabulary(WORD_VOCAB)
            py_model = _NewTask._init_model(config.model, tensorizers)
            dummy_test_input = self._get_rand_input_intent_slot(
                BATCH_SIZE, W_VOCAB_SIZE, test_num_words)
            exporter = ModelExporter(
                ModelExporter.Config(),
                py_model.get_export_input_names(tensorizers),
                dummy_test_input,
                py_model.vocab_to_export(tensorizers),
                py_model.get_export_output_names(tensorizers),
            )
            with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".predictor") as pred_file:
                exporter.export_to_caffe2(py_model, pred_file.name)
                workspace.ResetWorkspace()
            pred_net = pe.prepare_prediction_net(pred_file.name,
                                                 CAFFE2_DB_TYPE)
            for _i in range(num_predictions):
                test_inputs = self._get_rand_input_intent_slot(
                    BATCH_SIZE, W_VOCAB_SIZE, test_num_words)
                self._feed_c2_input(workspace, test_inputs,
                                    exporter.input_names, exporter.vocab_map)
                workspace.RunNetOnce(pred_net)
                word_output_names = [
                    "{}:{}".format("word_scores", class_name)
                    for class_name in word_labels
                ]
                py_model.eval()
                py_outs = py_model(*test_inputs)
                context = {"seq_lens": test_inputs[-1]}
                target = None
                pred, score = py_model.get_pred(py_outs, target, context)
                c2_word_out = []
                for o_name in word_output_names:
                    c2_word_out.extend(list(workspace.FetchBlob(o_name)))

                np.testing.assert_array_almost_equal(
                    torch.transpose(score, 1, 2).contiguous().view(-1).detach().numpy(),
                    np.array(c2_word_out).flatten(),
                )
Code Example #5
    def test_seq_nn_export_to_caffe2(
        self,
        export_num_words,
        num_doc_classes,
        test_num_words,
        num_predictions,
        test_num_seq,
    ):
        config = self._get_config(SeqNNTask.Config, SEQ_NN_CONFIG)
        tensorizers, data = _NewTask._init_tensorizers(config)
        doc_labels = [SpecialTokens.UNK, "cu:other", "cu:address_Person"]
        tensorizers["labels"].vocab = Vocabulary(doc_labels)
        tensorizers["tokens"].vocab = Vocabulary(WORD_VOCAB)
        py_model = _NewTask._init_model(config.model, tensorizers)
        dummy_test_input = self._get_seq_nn_rand_input(BATCH_SIZE,
                                                       W_VOCAB_SIZE,
                                                       test_num_words,
                                                       test_num_seq)
        exporter = ModelExporter(
            ModelExporter.Config(),
            py_model.get_export_input_names(tensorizers),
            dummy_test_input,
            py_model.vocab_to_export(tensorizers),
            py_model.get_export_output_names(tensorizers),
        )
        with tempfile.NamedTemporaryFile(
                delete=False, suffix=".predictor") as pred_file:
            output_names = exporter.export_to_caffe2(py_model, pred_file.name)
            workspace.ResetWorkspace()

        pred_net = pe.prepare_prediction_net(pred_file.name, CAFFE2_DB_TYPE)
        for _i in range(num_predictions):
            test_inputs = self._get_seq_nn_rand_input(BATCH_SIZE, W_VOCAB_SIZE,
                                                      test_num_words,
                                                      test_num_seq)
            self._feed_c2_input(workspace, test_inputs, exporter.input_names,
                                exporter.vocab_map)
            workspace.RunNetOnce(pred_net)
            c2_out = [
                list(workspace.FetchBlob(o_name)) for o_name in output_names
            ]

            py_model.eval()
            py_outs = py_model(*test_inputs)
            # Do log_softmax since we do that before exporting predictor nets
            py_outs = F.log_softmax(py_outs, 1)
            np.testing.assert_array_almost_equal(
                py_outs.view(-1).detach().numpy(),
                np.array(c2_out).flatten())
Code Example #6
File: tensorizers_test.py Project: twild-fb/pytext
    def test_lookup_tokens(self):
        text = "let's tokenize this"
        tokenizer = Tokenizer()
        vocab = Vocabulary(text.split() + [BOS, EOS])
        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=False,
            add_eos_token=False,
        )
        self.assertEqual(tokens, [0, 1, 2])
        self.assertEqual(start_idx, (0, 6, 15))
        self.assertEqual(end_idx, (5, 14, 19))

        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            add_bos_token=True,
            add_eos_token=True,
        )
        self.assertEqual(tokens, [3, 0, 1, 2, 4])
        self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
        self.assertEqual(end_idx, (-1, 5, 14, 19, -1))
Code Example #7
File: bert_tensorizer.py Project: nadileaf/pytext
 def from_config(cls, config: Config, **kwargs):
     """
     from_config parses the config associated with the tensorizer and
     creates both the tokenizer and the Vocabulary object. The extra arguments
     passed as kwargs allow us to reuse this function with a variable number
     of arguments (e.g., for classes that derive from this class).
     """
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     special_token_replacements = {
         "[UNK]": UNK,
         "[PAD]": PAD,
         "[CLS]": BOS,
         "[MASK]": MASK,
         "[SEP]": EOS,
     }
     if isinstance(tokenizer, WordPieceTokenizer):
         vocab = Vocabulary(
             [token for token, _ in tokenizer.vocab.items()],
             replacements=special_token_replacements,
         )
     else:
         with PathManager.open(config.vocab_file) as file_path:
             vocab = build_fairseq_vocab(
                 dictionary_class=BertDictionary,
                 vocab_file=file_path,
                 special_token_replacements=special_token_replacements,
             )
     return cls(
         columns=config.columns,
         vocab=vocab,
         tokenizer=tokenizer,
         max_seq_len=config.max_seq_len,
         **kwargs,
     )
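
The docstring above explains that this from_config builds both the tokenizer and the Vocabulary and forwards any extra kwargs to the constructor. A minimal usage sketch follows, assuming the method lives on a tensorizer class named BERTTensorizer and that its Config exposes the fields referenced above; the values are placeholders:

# Hypothetical invocation; BERTTensorizer and the field values are assumptions.
config = BERTTensorizer.Config(
    columns=["text"],                  # hypothetical column name
    vocab_file="/path/to/vocab.txt",   # placeholder path
    max_seq_len=256,
)
tensorizer = BERTTensorizer.from_config(config)  # extra kwargs are forwarded to cls(...)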
Code Example #8
File: output_layer_test.py Project: a-domingu/tbcnn
    def test_torchscript_word_tagging_output_layer(self, num_labels, seq_lens):
        batch_size = len(seq_lens)
        vocab = Vocabulary(
            [OutputLayerTest._generate_random_string() for _ in range(num_labels)]
        )

        word_layer = WordTaggingOutputLayer.from_config(
            config=WordTaggingOutputLayer.Config(), labels=vocab
        )
        crf_layer = CRFOutputLayer.from_config(
            config=CRFOutputLayer.Config(), labels=vocab
        )

        logits, seq_lens_tensor = OutputLayerTest._generate_word_tagging_inputs(
            batch_size, num_labels, seq_lens
        )
        context = {"seq_lens": seq_lens_tensor}

        torchscript_word_layer = word_layer.torchscript_predictions()
        torchscript_crf_layer = crf_layer.torchscript_predictions()

        self._validate_word_tagging_result(
            word_layer.get_pred(logits, None, context)[1],
            torchscript_word_layer(logits, context),
            vocab,
        )
        self._validate_word_tagging_result(
            crf_layer.get_pred(logits, None, context)[1],
            torchscript_crf_layer(logits, context),
            vocab,
        )
Code Example #9
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, Token] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
) -> Vocabulary:
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class
    and is used to load the vocab file.
    """
    dictionary = dictionary_class.load(vocab_file)
    # finalize will sort the dict based on frequency so only do this if
    # a min_count or max_vocab size is specified
    if min_count > 0 or max_vocab > 0:
        dictionary.finalize(threshold=min_count,
                            nwords=max_vocab,
                            padding_factor=1)
    if tokens_to_add:
        for token in tokens_to_add:
            dictionary.add_symbol(token)
    return Vocabulary(dictionary.symbols,
                      dictionary.count,
                      replacements=special_token_replacements)
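
A minimal call sketch for build_fairseq_vocab; the file path, thresholds, and extra token below are placeholders chosen for illustration:

vocab = build_fairseq_vocab(
    vocab_file="dict.txt",            # placeholder dictionary file
    dictionary_class=BertDictionary,  # any Fairseq Dictionary class
    max_vocab=50000,                  # > 0, so dictionary.finalize() runs
    min_count=2,
    tokens_to_add=["<custom>"],
)
pad_idx = vocab.get_pad_index()       # Vocabulary API as used in Example #10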
Code Example #10
File: bert_squad_qa.py Project: twwhatever/pytext
    def from_config(cls, config: Config, tensorizers):
        has_answer_labels = ["False", "True"]
        tensorizers["has_answer"].vocab = Vocabulary(has_answer_labels)
        vocab = tensorizers["squad_input"].vocab

        encoder = create_module(
            config.encoder,
            output_encoded_layers=True,
            padding_idx=vocab.get_pad_index(),
            vocab_size=len(vocab),
        )

        pos_decoder = create_module(config.pos_decoder,
                                    in_dim=encoder.representation_dim,
                                    out_dim=2)
        has_ans_decoder = create_module(
            config.has_ans_decoder,
            in_dim=encoder.representation_dim,
            out_dim=len(has_answer_labels),
        )

        output_layer = create_module(config.output_layer,
                                     labels=has_answer_labels,
                                     is_kd=config.is_kd)

        return cls(encoder,
                   pos_decoder,
                   has_ans_decoder,
                   output_layer,
                   is_kd=config.is_kd)
Code Example #11
File: output_layer_test.py Project: a-domingu/tbcnn
    def test_doc_classification_output_layer(self):
        tensorizer = LabelTensorizer()
        tensorizer.vocab = Vocabulary([SpecialTokens.PAD, "foo", "bar"])
        layer = ClassificationOutputLayer.from_config(
            config=ClassificationOutputLayer.Config(loss=CrossEntropyLoss.Config()),
            labels=tensorizer.vocab,
        )
        self.assertEqual(layer.loss_fn.ignore_index, 0)

        # use default pad
        tensorizer.vocab = Vocabulary(["foo", "bar"])
        layer = ClassificationOutputLayer.from_config(
            config=ClassificationOutputLayer.Config(loss=CrossEntropyLoss.Config()),
            labels=tensorizer.vocab,
        )
        self.assertEqual(layer.loss_fn.ignore_index, -1)
Code Example #12
def build_fairseq_vocab(
    vocab_file: str,
    dictionary_class: Dictionary = Dictionary,
    special_token_replacements: Dict[str, SpecialToken] = None,
    max_vocab: int = -1,
    min_count: int = -1,
    tokens_to_add: Optional[List[str]] = None,
):
    """
    Function builds a PyText vocabulary for models pre-trained using Fairseq
    modules. The dictionary class can take any Fairseq Dictionary class
    and is used to load the vocab file.
    """
    if not special_token_replacements:
        special_token_replacements = {
            "<pad>": SpecialTokens.PAD,
            "<s>": SpecialTokens.BOS,
            "</s>": SpecialTokens.EOS,
            "<unk>": SpecialTokens.UNK,
            "<mask>": SpecialTokens.MASK,
        }
    with PathManager.open(vocab_file) as f:
        dictionary = dictionary_class.load(f)
        # finalize will sort the dict based on frequency so only do this if
        # a min_count or max_vocab size is specified
        if min_count > 0 or max_vocab > 0:
            dictionary.finalize(threshold=min_count, nwords=max_vocab, padding_factor=1)
        if tokens_to_add:
            for token in tokens_to_add:
                dictionary.add_symbol(token)
        return Vocabulary(
            dictionary.symbols,
            dictionary.count,
            replacements=special_token_replacements,
        )
Code Example #13
 def setUp(self):
     self.input_iterator = [
         {"text": "hello world"},
         {"text": "feeling lucky today"},
         {"text": "hello"},
         {"text": "lucky world"},
         {"text": "today world"},
     ]
     self.vocab = Vocabulary(["hello", "world", "feeling", "lucky", "today"])
Code Example #14
File: transforms.py Project: freegliboracle/pytext
 def __init__(self, poss_slots: List[str], tokenizer: nn.Module = None):
     super().__init__()
     self.NO_LABEL = Token("NoLabel")
     poss_slots = list(poss_slots)
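     # Pin NO_LABEL to index 0 and PAD to index 1 so slot label ids are deterministic.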
     if self.NO_LABEL not in poss_slots:
         poss_slots.insert(0, self.NO_LABEL)
     if SpecialTokens.PAD not in poss_slots:
         poss_slots.insert(1, SpecialTokens.PAD)
     self.vocab = Vocabulary(poss_slots)
Code Example #15
 def __init__(self, bpe, dictionary: Dictionary):
     self.bpe = bpe
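     # Reuse the fairseq Dictionary's own pad/bos/eos strings so the PyText
     # Vocabulary and the Dictionary agree on the special-token symbols.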
     self.vocab = Vocabulary(
         dictionary.symbols,
         pad_token=str(dictionary[dictionary.pad()]),
         bos_token=str(dictionary[dictionary.bos()]),
         eos_token=str(dictionary[dictionary.eos()]),
     )
     self.bos = self.vocab.bos_token
     self.eos = self.vocab.eos_token
Code Example #16
File: output_layer_test.py Project: a-domingu/tbcnn
 def test_create_word_tagging_output_layer(self):
     tensorizer = LabelTensorizer()
     tensorizer.vocab = Vocabulary(["foo", "bar"])
     tensorizer.pad_idx = 0
     layer = WordTaggingOutputLayer.from_config(
         config=WordTaggingOutputLayer.Config(label_weights={"foo": 2.2}),
         labels=tensorizer.vocab,
     )
     np.testing.assert_array_almost_equal(
         np.array([2.2, 1]), layer.loss_fn.weight.detach().numpy()
     )
Code Example #17
def build_dumb_slot_labelling_model():
    return build_slot_labelling_model(
        None,
        5,
        100,
        [10 for i in range(100)],
        0.4,
        False,
        None,
        None,
        5,
        Vocabulary([SpecialTokens.UNK, SpecialTokens.PAD, "the", "cat"]),
    )
Code Example #18
 def _build_vocab(self, vocab_file: str, max_vocab: int,
                  min_count: int) -> Vocabulary:
     """
     Build Vocab for XLM by calling the vocab reader associated with the model
     source.
     """
     if self.is_fairseq:
         vocab_list, counts, replacements = read_fairseq_vocab(
             vocab_file, max_vocab, min_count)
     else:
         vocab_list, counts, replacements = read_vocab(
             vocab_file, max_vocab, min_count)
     return Vocabulary(vocab_list, counts, replacements=replacements)
Code Example #19
def build_legacy_pytext_vocab_pipeline(vocab_file):
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")
    with open(vocab_file, "r") as f:
        # Count whitespace-separated tokens per line.
        vocab_counter = Counter(token for line in f for token in line.rstrip().split())
    sorted_by_freq_tuples = sorted(vocab_counter.items(), key=lambda x: x[1], reverse=True)
    vocab_list = [pair[0] for pair in sorted_by_freq_tuples]
    vocab_list.insert(0, "<unk>")

    pipeline = sequential_transforms(tokenizer_func(tokenizer),
                                     PyTextVocabTransform(Vocabulary(vocab_list, unk_token="<unk>")))
    return pipeline, None, None
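
Assuming sequential_transforms composes its callables left to right, the returned pipeline maps a raw string to token ids; a hypothetical invocation:

pipeline, _, _ = build_legacy_pytext_vocab_pipeline("vocab.txt")  # placeholder file
token_ids = pipeline("hello world")  # tokenize, then map tokens to Vocabulary ids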
Code Example #20
    def from_config(cls, config: Config, tensorizers: Dict[str, Tensorizer]):
        # Although the RNN params are configurable, for DrQA we want to set
        # the following parameters for all cases.
        config.ques_rnn.dropout = config.dropout
        config.doc_rnn.dropout = config.dropout

        embedding = cls.create_embedding(config, tensorizers)
        ques_aligned_doc_attn = SequenceAlignedAttention(
            embedding.embedding_dim)
        ques_rnn = create_module(config.ques_rnn,
                                 input_size=embedding.embedding_dim)
        doc_rnn = create_module(config.doc_rnn,
                                input_size=embedding.embedding_dim * 2)
        ques_self_attn = DotProductSelfAttention(ques_rnn.representation_dim)
        start_attn = MultiplicativeAttention(doc_rnn.representation_dim,
                                             ques_rnn.representation_dim,
                                             normalize=False)
        end_attn = MultiplicativeAttention(doc_rnn.representation_dim,
                                           ques_rnn.representation_dim,
                                           normalize=False)
        doc_rep_pool = SelfAttention(
            SelfAttention.Config(dropout=config.dropout),
            n_input=doc_rnn.representation_dim,
        )
        has_answer_labels = ["False", "True"]
        tensorizers["has_answer"].vocab = Vocabulary(has_answer_labels)
        has_ans_decoder = MLPDecoder(
            config=MLPDecoder.Config(),
            in_dim=doc_rnn.representation_dim,
            out_dim=len(has_answer_labels),
        )
        output_layer = create_module(config.output_layer,
                                     labels=has_answer_labels,
                                     is_kd=config.is_kd)
        return cls(
            dropout=nn.Dropout(config.dropout),
            embedding=embedding,
            ques_rnn=ques_rnn,
            doc_rnn=doc_rnn,
            ques_self_attn=ques_self_attn,
            ques_aligned_doc_attn=ques_aligned_doc_attn,
            start_attn=start_attn,
            end_attn=end_attn,
            doc_rep_pool=doc_rep_pool,
            has_ans_decoder=has_ans_decoder,
            output_layer=output_layer,
            is_kd=config.is_kd,
        )
Code Example #21
def build_dumb_intent_slot_model():
    return build_intent_joint_model(
        use_intent=False,
        loss_doc_weight=0.4,
        pretrain_embed=None,
        embed_dim=10,
        slot_kernel_num=10,
        slot_kernel_sizes=[10 for i in range(100)],
        doc_kernel_num=10,
        doc_kernel_sizes=[10 for i in range(100)],
        slot_bias=True,
        slot_decoder_hidden_dims=None,
        doc_bias=True,
        doc_decoder_hidden_dims=None,
        num_slots=26,
        num_intents=43,
        vocab=Vocabulary([SpecialTokens.UNK, SpecialTokens.PAD, "the", "cat"]),
        dropout=0.4,
        add_feat_len=0,
    )
Code Example #22
    def from_config(cls, config: Config, **kwargs):
        tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
        vocab = None
        if isinstance(tokenizer, WordPieceTokenizer):
            print("Using WordPieceTokenizer")
            replacements = {
                "[UNK]": UNK,
                "[PAD]": PAD,
                "[CLS]": BOS,
                "[SEP]": EOS,
                "[MASK]": MASK,
            }
            vocab = Vocabulary(
                [token for token, _ in tokenizer.vocab.items()],
                replacements=replacements,
            )

        doc_tensorizer = TokenTensorizer(
            text_column=config.doc_column,
            tokenizer=tokenizer,
            vocab=vocab,
            max_seq_len=config.max_doc_seq_len,
        )
        ques_tensorizer = TokenTensorizer(
            text_column=config.ques_column,
            tokenizer=tokenizer,
            vocab=vocab,
            max_seq_len=config.max_ques_seq_len,
        )
        return cls(
            doc_tensorizer=doc_tensorizer,
            ques_tensorizer=ques_tensorizer,
            doc_column=config.doc_column,
            ques_column=config.ques_column,
            answers_column=config.answers_column,
            answer_starts_column=config.answer_starts_column,
            tokenizer=tokenizer,
            vocab=vocab,
            **kwargs,
        )
Code Example #23
    def test_lookup_tokens(self):
        text = "let's tokenize this"
        tokenizer = Tokenizer()
        vocab = Vocabulary(text.split() +
                           [SpecialTokens.BOS, SpecialTokens.EOS])
        tokens, start_idx, end_idx = lookup_tokens(text,
                                                   tokenizer=tokenizer,
                                                   vocab=vocab,
                                                   bos_token=None,
                                                   eos_token=None)
        self.assertEqual(tokens, [0, 1, 2])
        self.assertEqual(start_idx, (0, 6, 15))
        self.assertEqual(end_idx, (5, 14, 19))

        tokens, start_idx, end_idx = lookup_tokens(
            text,
            tokenizer=tokenizer,
            vocab=vocab,
            bos_token=SpecialTokens.BOS,
            eos_token=SpecialTokens.EOS,
        )
        self.assertEqual(tokens, [3, 0, 1, 2, 4])
        self.assertEqual(start_idx, (-1, 0, 6, 15, -1))
        self.assertEqual(end_idx, (-1, 5, 14, 19, -1))
Code Example #24
    def test_contextual_intent_slot_export_to_caffe2(self, test_num_words,
                                                     num_predictions,
                                                     test_num_seq):
        config = self._get_config(IntentSlotTask.Config,
                                  CONTEXTUAL_INTENT_SLOT_CONFIG)
        tensorizers, data = _NewTask._init_tensorizers(config)
        doc_labels = ["__UNKNOWN__", "cu:other", "cu:address_Person"]
        word_labels = ["__UNKNOWN__", "NoLabel", "person"]
        tensorizers["word_labels"].vocab = Vocabulary(word_labels)
        tensorizers["doc_labels"].vocab = Vocabulary(doc_labels)
        tensorizers["tokens"].vocab = Vocabulary(WORD_VOCAB)
        tensorizers["seq_tokens"].vocab = Vocabulary(WORD_VOCAB)
        py_model = _NewTask._init_model(config.model, tensorizers)
        dummy_test_input = self._get_rand_input_intent_slot(
            BATCH_SIZE, W_VOCAB_SIZE, test_num_words, test_num_seq)
        exporter = ModelExporter(
            ModelExporter.Config(),
            py_model.get_export_input_names(tensorizers),
            dummy_test_input,
            py_model.vocab_to_export(tensorizers),
            py_model.get_export_output_names(tensorizers),
        )

        with tempfile.NamedTemporaryFile(
                delete=False, suffix=".predictor") as pred_file:
            print(pred_file.name)
            exporter.export_to_caffe2(py_model, pred_file.name)
            workspace.ResetWorkspace()

        pred_net = pe.prepare_prediction_net(pred_file.name, CAFFE2_DB_TYPE)
        for _i in range(num_predictions):
            test_inputs = self._get_rand_input_intent_slot(
                BATCH_SIZE, W_VOCAB_SIZE, test_num_words, test_num_seq)
            self._feed_c2_input(workspace, test_inputs, exporter.input_names,
                                exporter.vocab_map)
            workspace.RunNetOnce(pred_net)
            doc_output_names = [
                "{}:{}".format("doc_scores", class_name)
                for class_name in doc_labels
            ]
            word_output_names = [
                "{}:{}".format("word_scores", class_name)
                for class_name in word_labels
            ]
            py_model.eval()
            logits = py_model(*test_inputs)
            context = {SEQ_LENS: test_inputs[-1]}
            target = None
            (d_pred, w_pred), (d_score, w_score) = py_model.get_pred(
                logits, target, context
            )

            c2_doc_out = []
            for o_name in doc_output_names:
                c2_doc_out.extend(list(workspace.FetchBlob(o_name)))
            c2_word_out = []
            for o_name in word_output_names:
                c2_word_out.extend(list(workspace.FetchBlob(o_name)))

            np.testing.assert_array_almost_equal(
                d_score.view(-1).detach().numpy(),
                np.array(c2_doc_out).flatten())

            np.testing.assert_array_almost_equal(
                torch.transpose(w_score, 1, 2).contiguous().view(-1).detach().numpy(),
                np.array(c2_word_out).flatten(),
            )
Code Example #25
 def __init__(self, label_names: List[str]):
     super().__init__()
     self.vocab = Vocabulary(label_names)
Code Example #26
 def __init__(self, label_names: List[str]):
     super().__init__()
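     # Guarantee an UNK entry at index 0 so unseen labels can still be mapped.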
     if SpecialTokens.UNK not in label_names:
         label_names.insert(0, SpecialTokens.UNK)
     self.vocab = Vocabulary(label_names)