Example #1
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Example #2
    def test_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers={
                              "characters":
                              TokenCharactersIndexer("characters")
                          })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters"),
                "words": SingleIdTokenIndexer("words")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}
Example #3
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace="words")
        capital_a_index = vocab.add_token_to_namespace("A", namespace="words")
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace="characters")
        s_index = vocab.add_token_to_namespace("s", namespace="characters")
        e_index = vocab.add_token_to_namespace("e", namespace="characters")
        n_index = vocab.add_token_to_namespace("n", namespace="characters")
        t_index = vocab.add_token_to_namespace("t", namespace="characters")
        c_index = vocab.add_token_to_namespace("c", namespace="characters")

        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"words": SingleIdTokenIndexer(namespace="words")},
        )
        field.index(vocab)

        assert field._indexed_tokens["words"]["tokens"] == [
            capital_a_index, sentence_index
        ]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {
                "characters":
                TokenCharactersIndexer(namespace="characters",
                                       min_padding_length=1)
            },
        )
        field1.index(vocab)
        assert field1._indexed_tokens["characters"]["token_characters"] == [
            [capital_a_char_index],
            [
                s_index, e_index, n_index, t_index, e_index, n_index, c_index,
                e_index
            ],
        ]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words":
                SingleIdTokenIndexer(namespace="words"),
                "characters":
                TokenCharactersIndexer(namespace="characters",
                                       min_padding_length=1),
            },
        )
        field2.index(vocab)
        assert field2._indexed_tokens["words"]["tokens"] == [
            capital_a_index, sentence_index
        ]
        assert field2._indexed_tokens["characters"]["token_characters"] == [
            [capital_a_char_index],
            [
                s_index, e_index, n_index, t_index, e_index, n_index, c_index,
                e_index
            ],
        ]
Example #4
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.tokenizer_space = WhitespaceTokenizer()
        self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                              pos_tags=True,
                                              split_on_spaces=True)
        self.token_indexers = {
            'elmo_tokens':
            ELMoTokenCharactersIndexer(),
            'token_characters':
            TokenCharactersIndexer(namespace='character_vocab',
                                   min_padding_length=6),
            'pos_tags':
            SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                 feature_name='tag_'),
            'ner_tags':
            SingleIdTokenIndexer(namespace='ner_tag_vocab',
                                 feature_name='ent_type_')
        }

        self.slot_indexers = {
            'elmo_tokens':
            ELMoTokenCharactersIndexer(),
            'token_characters':
            TokenCharactersIndexer(namespace='character_vocab',
                                   min_padding_length=6)
        }
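For orientation, indexers like self.token_indexers above are normally attached to a TextField inside the reader's text_to_instance. The following is a minimal sketch only (this method is not part of the original reader, and it assumes the AllenNLP 1.x API):

from allennlp.data import Instance
from allennlp.data.fields import TextField

    def text_to_instance(self, text: str) -> Instance:  # hypothetical method on the same reader
        # The SpaCy tokenizer supplies the linguistic attributes (e.g. tag_) that the
        # feature_name-based SingleIdTokenIndexers read.
        tokens = self.tokenizer_spacy.tokenize(text)
        # Each entry in self.token_indexers ('elmo_tokens', 'token_characters', ...)
        # indexes the same token list into its own vocabulary namespace.
        return Instance({"tokens": TextField(tokens, self.token_indexers)})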
Example #5
    def test_field_counts_vocab_items_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField(["This", "is", "a", "sentence", "."],
                          token_indexers={
                              "characters":
                              TokenCharactersIndexer("characters")
                          })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters")
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}
Example #6
    def test_count_vocab_items_respects_casing(self):
        indexer = TokenCharactersIndexer("characters")
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["characters"] == {"h": 1, "H": 1, "e": 2, "l": 4, "o": 2}

        indexer = TokenCharactersIndexer("characters", CharacterTokenizer(lowercase_characters=True))
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["characters"] == {"h": 2, "e": 2, "l": 4, "o": 2}
Example #7
    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts[u"words"][u"This"] == 1
        assert namespace_token_counts[u"words"][u"is"] == 1
        assert namespace_token_counts[u"words"][u"a"] == 1
        assert namespace_token_counts[u"words"][u"sentence"] == 1
        assert namespace_token_counts[u"words"][u"."] == 1
        assert list(namespace_token_counts.keys()) == [u"words"]

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts[u"characters"][u"T"] == 1
        assert namespace_token_counts[u"characters"][u"h"] == 1
        assert namespace_token_counts[u"characters"][u"i"] == 2
        assert namespace_token_counts[u"characters"][u"s"] == 3
        assert namespace_token_counts[u"characters"][u"a"] == 1
        assert namespace_token_counts[u"characters"][u"e"] == 3
        assert namespace_token_counts[u"characters"][u"n"] == 2
        assert namespace_token_counts[u"characters"][u"t"] == 1
        assert namespace_token_counts[u"characters"][u"c"] == 1
        assert namespace_token_counts[u"characters"][u"."] == 1
        assert list(namespace_token_counts.keys()) == [u"characters"]

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts[u"characters"][u"T"] == 1
        assert namespace_token_counts[u"characters"][u"h"] == 1
        assert namespace_token_counts[u"characters"][u"i"] == 2
        assert namespace_token_counts[u"characters"][u"s"] == 3
        assert namespace_token_counts[u"characters"][u"a"] == 1
        assert namespace_token_counts[u"characters"][u"e"] == 3
        assert namespace_token_counts[u"characters"][u"n"] == 2
        assert namespace_token_counts[u"characters"][u"t"] == 1
        assert namespace_token_counts[u"characters"][u"c"] == 1
        assert namespace_token_counts[u"characters"][u"."] == 1
        assert namespace_token_counts[u"words"][u"This"] == 1
        assert namespace_token_counts[u"words"][u"is"] == 1
        assert namespace_token_counts[u"words"][u"a"] == 1
        assert namespace_token_counts[u"words"][u"sentence"] == 1
        assert namespace_token_counts[u"words"][u"."] == 1
        assert set(namespace_token_counts.keys()) == set([u"words", u"characters"])
Example #8
    def test_with_token_characters_indexer(self):

        inputs = {
            "premise": "I always write unit tests for my code.",
            "hypothesis": "One time I didn't write any unit tests for my code.",
        }

        archive = load_archive(
            self.FIXTURES_ROOT / "decomposable_attention" / "serialization" / "model.tar.gz"
        )
        predictor = Predictor.from_archive(archive, "textual-entailment")
        predictor._dataset_reader._token_indexers["chars"] = TokenCharactersIndexer(
            min_padding_length=1
        )
        predictor._model._text_field_embedder._token_embedders["chars"] = EmptyEmbedder()

        hotflipper = Hotflip(predictor)
        hotflipper.initialize()
        attack = hotflipper.attack_from_json(inputs, "hypothesis", "grad_input_1")
        assert attack is not None
        assert "final" in attack
        assert "original" in attack
        assert "outputs" in attack
        assert len(attack["final"][0]) == len(
            attack["original"]
        )  # hotflip replaces words without removing
Example #9
    def test_min_padding_length(self):
        sentence = "AllenNLP is awesome ."
        tokens = [Token(token) for token in sentence.split(" ")]
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("l", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("N", namespace="characters")  # 6
        vocab.add_token_to_namespace("L", namespace="characters")  # 7
        vocab.add_token_to_namespace("P", namespace="characters")  # 8
        vocab.add_token_to_namespace("i", namespace="characters")  # 9
        vocab.add_token_to_namespace("s", namespace="characters")  # 10
        vocab.add_token_to_namespace("a", namespace="characters")  # 11
        vocab.add_token_to_namespace("w", namespace="characters")  # 12
        vocab.add_token_to_namespace("o", namespace="characters")  # 13
        vocab.add_token_to_namespace("m", namespace="characters")  # 14
        vocab.add_token_to_namespace(".", namespace="characters")  # 15

        indexer = TokenCharactersIndexer("characters", min_padding_length=10)
        indices = indexer.tokens_to_indices(tokens, vocab, "char")
        key_padding_lengths = "num_token_characters"
        value_padding_lengths = 0
        for token in indices["char"]:
            item = indexer.get_padding_lengths(token)
            value = item.values()
            value_padding_lengths = max(value_padding_lengths, max(value))
        padded = indexer.as_padded_tensor(
            indices, {"char": len(indices["char"])},
            {key_padding_lengths: value_padding_lengths})
        assert padded["char"].tolist() == [[2, 3, 3, 4, 5, 6, 7, 8, 0, 0],
                                           [9, 10, 0, 0, 0, 0, 0, 0, 0, 0],
                                           [11, 12, 4, 10, 13, 14, 4, 0, 0, 0],
                                           [15, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Example #10
    def test_token_indexer_returns_dict(self):
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          token_indexers={"field_with_dict": DictReturningTokenIndexer(),
                                          "words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters",
                                                                               min_padding_length=1)})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 2,
            'words_length': 2,
            'characters_length': 2,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        padding_lengths['token_ids_length'] = 7
        padding_lengths['additional_key_length'] = 3
        padding_lengths['words_length'] = 4
        padding_lengths['characters_length'] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors['token_ids'].shape) == [7]
        assert list(tensors['additional_key'].shape) == [3]
        assert list(tensors['words'].shape) == [4]
        assert list(tensors['characters'].shape) == [4, 8]
Example #11
def main():
    dataset_reader = CopyNetDatasetReader(
        target_namespace='target_tokens',
        source_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='source_tokens'),
            'token_characters': TokenCharactersIndexer()
        })

    model_archive = load_archive(
        archive_file='checkpoints/model.tar.gz',
        cuda_device=-1,
        weights_file='checkpoints/model_state_epoch_28.th')

    model = model_archive.model
    model.eval()

    predictor = Seq2SeqPredictor(model=model, dataset_reader=dataset_reader)

    with open('snips/val.tsv') as val_file:
        for line in val_file:
            source, target = line.strip().split('\t')
            print('Gold Target: {}'.format(
                target.replace('OPEN', '(').replace('CLOSE', ')')))
            predicted_tokens = predictor.predict(source)['predicted_tokens'][0]
            print('Predictions: {}'.format(' '.join(predicted_tokens)).replace(
                'OPEN', '(').replace('CLOSE', ')') + '\n')
Example #12
def build_indexers(args):
    indexers = {}
    if not args.word_embs == "none":
        indexers["words"] = SingleIdTokenIndexer()
    if args.elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            " you are using args.tokenizer = {args.tokenizer}")
    if args.openai_transformer:
        assert not indexers, ("OpenAI transformer is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == "OpenAI.BPE", (
            "OpenAI transformer is not supported alongside"
            " other indexers due to tokenization!")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer(
            "openai_bpe")
    if args.bert_model_name:
        assert not indexers, ("BERT is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == args.bert_model_name, (
            "BERT models use custom WPM tokenization for "
            "each model, so tokenizer must match the "
            "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(
            args.bert_model_name)
    return indexers
Example #13
    def test_min_padding_length(self):
        sentence = "AllenNLP is awesome ."
        tokens = [Token(token) for token in sentence.split(" ")]
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("l", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("N", namespace="characters")  # 6
        vocab.add_token_to_namespace("L", namespace="characters")  # 7
        vocab.add_token_to_namespace("P", namespace="characters")  # 8
        vocab.add_token_to_namespace("i", namespace="characters")  # 9
        vocab.add_token_to_namespace("s", namespace="characters")  # 10
        vocab.add_token_to_namespace("a", namespace="characters")  # 11
        vocab.add_token_to_namespace("w", namespace="characters")  # 12
        vocab.add_token_to_namespace("o", namespace="characters")  # 13
        vocab.add_token_to_namespace("m", namespace="characters")  # 14
        vocab.add_token_to_namespace(".", namespace="characters")  # 15

        indexer = TokenCharactersIndexer("characters", min_padding_length=10)
        indices = indexer.tokens_to_indices(tokens, vocab)
        padded = indexer.as_padded_tensor_dict(
            indices, indexer.get_padding_lengths(indices))
        assert padded["token_characters"].tolist() == [
            [2, 3, 3, 4, 5, 6, 7, 8, 0, 0],
            [9, 10, 0, 0, 0, 0, 0, 0, 0, 0],
            [11, 12, 4, 10, 13, 14, 4, 0, 0, 0],
            [15, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ]
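The min_padding_length=10 argument above is what guarantees the trailing zero columns in the padded tensor. As a minimal standalone sketch (illustrative two-token field; the ___-prefixed key names assume the newer API also used in Examples #20 and #22), the floor changes the reported padding lengths like this:

from allennlp.data import Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import TokenCharactersIndexer

vocab = Vocabulary()
for ch in "is.":
    vocab.add_token_to_namespace(ch, namespace="characters")
tokens = [Token(t) for t in ["is", "."]]

# Without a floor, the character dimension shrinks to the longest token (2 here),
# which can be shorter than the widest filter of a downstream character CNN.
short = TextField(tokens, {"chars": TokenCharactersIndexer("characters")})
short.index(vocab)
print(short.get_padding_lengths())   # chars___num_token_characters is 2

# min_padding_length enforces a minimum character dimension regardless of token lengths.
padded = TextField(tokens, {"chars": TokenCharactersIndexer("characters", min_padding_length=5)})
padded.index(vocab)
print(padded.get_padding_lengths())  # chars___num_token_characters is now 5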
Example #14
def build_indexers(args):
    indexers = {}
    if not args.input_module.startswith("bert") and args.input_module not in [
            "elmo", "gpt"
    ]:
        indexers["words"] = SingleIdTokenIndexer()
    if args.input_module == "elmo":
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            " you are using args.tokenizer = {args.tokenizer}")
    if args.input_module == "gpt":
        assert (
            not indexers
        ), "OpenAI transformer is not supported alongside other indexers due to tokenization."
        assert (
            args.tokenizer == "OpenAI.BPE"
        ), "OpenAI transformer uses custom BPE tokenization. Set tokenizer=OpenAI.BPE."
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer(
            "openai_bpe")
    if args.input_module.startswith("bert"):
        assert not indexers, "BERT is not supported alongside other indexers due to tokenization."
        assert args.tokenizer == args.input_module, (
            "BERT models use custom WPM tokenization for "
            "each model, so tokenizer must match the "
            "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(
            args.input_module)
    return indexers
Example #15
    def test_with_token_characters_indexer(self):

        inputs = {"sentence": "I always write unit tests for my code."}

        archive = load_archive(self.FIXTURES_ROOT / "basic_classifier" /
                               "serialization" / "model.tar.gz")
        predictor = Predictor.from_archive(archive)
        predictor._dataset_reader._token_indexers[
            "chars"] = TokenCharactersIndexer(min_padding_length=1)
        predictor._model._text_field_embedder._token_embedders[
            "chars"] = EmptyEmbedder()

        hotflipper = Hotflip(predictor)
        hotflipper.initialize()
        attack = hotflipper.attack_from_json(inputs, "tokens", "grad_input_1")
        assert attack is not None
        assert "final" in attack
        assert "original" in attack
        assert "outputs" in attack
        assert len(attack["final"][0]) == len(
            attack["original"])  # hotflip replaces words without removing

        # This checks for a bug that arose with a change in the pytorch API.  We want to be sure we
        # can handle the case where we have to re-encode a vocab item because we didn't save it in
        # our fake embedding matrix (see Hotflip docstring for more info).
        hotflipper = Hotflip(predictor, max_tokens=50)
        hotflipper.initialize()
        hotflipper._first_order_taylor(grad=torch.rand((10, )).numpy(),
                                       token_idx=torch.tensor(60),
                                       sign=1)
Example #16
def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            " you are using args.tokenizer = {args.tokenizer}")

    if input_module_uses_transformers(args.input_module):
        assert not indexers, (
            "transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization.")
        assert args.tokenizer == args.input_module, (
            "transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model.")
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
Example #17
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"),
                                              "characters": TokenCharactersIndexer("characters")}
        self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]],
                                self.word_indexer)
        self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]],
                                self.word_indexer)
        self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]],
                                self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()
Example #18
def create_save_vocab(file_path, target_dir, word_min_count, char_min_count):
    namespace_word = "word2idx"
    namespace_char = "char2idx"
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace=namespace_word),
        "chars": TokenCharactersIndexer(namespace=namespace_char)
    }
    min_count = {
        namespace_word: word_min_count,
        namespace_char: char_min_count
    }

    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances, min_count=min_count)
    word_cnt = vocab.get_vocab_size(namespace_word)
    char_cnt = vocab.get_vocab_size(namespace_char)
    vocab.save_to_files(target_dir)
    print("save word2idx={}, char2idx={} to {}".format(word_cnt, char_cnt,
                                                       target_dir))
    word2idx = vocab.get_index_to_token_vocabulary(namespace_word)
    char2idx = vocab.get_index_to_token_vocabulary(namespace_char)
    print(char2idx)
    vocab = Vocabulary.from_files(target_dir)
    char2idx = vocab.get_index_to_token_vocabulary(namespace_char)
    print(char2idx)
    return
Example #19
def read_squad_word_char(file_path):
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars")
    }
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    #print (word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
Example #20
    def test_token_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
                "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
                "characters": TokenCharactersIndexer("characters",
                                                     min_padding_length=1,
                                                     token_min_padding_length=3),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "field_with_dict___token_ids": 5,
            "field_with_dict___additional_key": 3,
            "words___tokens": 3,
            "characters___token_characters": 3,
            "characters___num_token_characters": 8,
        }
        tensors = field.as_tensor(padding_lengths)
        assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
        assert tensors["words"]["tokens"].tolist()[-1] == 0
        assert tensors["characters"]["token_characters"].tolist()[-1] == [0] * 8
Example #21
    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["a", "sentence", "."]],
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 5
        padding_lengths["characters_length"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Example #22
    def test_token_indexer_returns_dict(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(),
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters", min_padding_length=1),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "field_with_dict___token_ids": 5,
            "field_with_dict___additional_key": 2,
            "words___tokens": 2,
            "characters___token_characters": 2,
            "characters___num_token_characters": 8,
        }
        padding_lengths["field_with_dict___token_ids"] = 7
        padding_lengths["field_with_dict___additional_key"] = 3
        padding_lengths["words___tokens"] = 4
        padding_lengths["characters___token_characters"] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors["field_with_dict"]["token_ids"].shape) == [7]
        assert list(tensors["field_with_dict"]["additional_key"].shape) == [3]
        assert list(tensors["words"]["tokens"].shape) == [4]
        assert list(tensors["characters"]["token_characters"].shape) == [4, 8]
Example #23
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 name_token_indexers: Dict[str, TokenIndexer] = None,
                 token_only_indexer: Dict[str, TokenIndexer] = None) -> None:
        self._name_token_indexers = name_token_indexers or \
                                    {'tokens': SingleIdTokenIndexer(namespace="tokens"),
                                     'token_characters': TokenCharactersIndexer(namespace="token_characters")}
        self._token_only_indexer = token_only_indexer or \
                                   {'tokens': SingleIdTokenIndexer(namespace="tokens")}
        self._tokenizer = tokenizer or WordTokenizer()

        self._empty_token_text_field = TextField(
            self._tokenizer.tokenize('00000'), self._token_only_indexer)
        self._empty_list_token_text_field = ListField([
            TextField(self._tokenizer.tokenize('00000'),
                      self._token_only_indexer)
        ])

        self.PARENT_REL_LABELS = constants.UMLS_PARENT_REL_LABELS
        self.CHILD_REL_LABELS = constants.UMLS_CHILD_REL_LABELS

        self.STOP = set(stopwords.words('english'))
        self.tokenizer = RegexpTokenizer(r'[A-Za-z\d]+')
        self.stemmer = SnowballStemmer("english")
        self.lemmatizer = WordNetLemmatizer()

        self.nlp = spacy.load('en')
Example #24
    def test_token_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
                "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
                "characters": TokenCharactersIndexer("characters",
                                                     min_padding_length=1,
                                                     token_min_padding_length=3)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 3,
            'words_length': 3,
            'characters_length': 3,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        tensors = field.as_tensor(padding_lengths)
        assert tensors['additional_key'].tolist()[-1] == 0
        assert tensors['words'].tolist()[-1] == 0
        assert tensors['characters'].tolist()[-1] == [0] * 8
Example #25
    def from_params(cls, params: Params) -> "PnetOntoDatasetReader":
        # token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
        token_indexers = {
            "tokens": SingleIdTokenIndexer(lowercase_tokens=True),
            "token_characters": TokenCharactersIndexer(),
            "elmo": ELMoTokenCharactersIndexer(),
        }
        valid_class = params.pop("valid_class")
        random_seed = params.pop("random_seed")
        drop_empty = params.pop("drop_empty")
        valid_part = params.pop("valid_part")

        tag_label = params.pop("tag_label", None)
        feature_labels = params.pop("feature_labels", ())
        lazy = params.pop("lazy", False)
        params.assert_empty(cls.__name__)
        return PnetOntoDatasetReader(
            token_indexers=token_indexers,
            valid_class=valid_class,
            random_seed=random_seed,
            drop_empty=drop_empty,
            valid_part=valid_part,
            tag_label=tag_label,
            feature_labels=feature_labels,
            lazy=lazy,
        )
Example #26
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexer = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters")
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)
        super(TestListField, self).setUp()
Example #27
def read_squad_allennlp(file_path):
    '''read data, build vocab, batch, padding, to idx
    Args:
        file_path -- raw squad json file
    Returns:
        None
    '''
    token_indexers = {
            "tokens": SingleIdTokenIndexer(namespace="token_ids"),
            "chars": TokenCharactersIndexer(namespace="token_chars")}
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    for instance in instances:
        question = instance.fields['question']
        print (question)
        print (type(question))
        break
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    #print (word2idx)
    print (len(word2idx))
    print (len(char2idx))
    print (char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print (padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print (tensor_dict['passage']['tokens'].shape)
    print (tensor_dict['passage']['chars'].shape)
    print (tensor_dict['question']['tokens'].shape)
    print (tensor_dict['question']['chars'].shape)
    print (tensor_dict['span_start'].shape)
    print (tensor_dict['span_end'].shape)
Example #28
    def test_as_array_produces_token_sequence(self):
        indexer = TokenCharactersIndexer("characters")
        padded_tokens = indexer.pad_token_sequence([[1, 2, 3, 4, 5], [1, 2, 3], [1]],
                                                   desired_num_tokens=4,
                                                   padding_lengths={"num_token_characters": 10})
        assert padded_tokens == [[1, 2, 3, 4, 5, 0, 0, 0, 0, 0],
                                 [1, 2, 3, 0, 0, 0, 0, 0, 0, 0],
                                 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
Example #29
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or CharacterTokenizer()
        self._token_indexers = token_indexers or {
            'tokens': TokenCharactersIndexer()
        }
Example #30
    def test_as_tensor_handles_characters_if_empty_field(self):
        field = TextField([], token_indexers={"characters": TokenCharactersIndexer("characters",
                                                                                   min_padding_length=1)})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([])
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                                expected_character_array)