def test_min_padding_length(self):
        sentence = "AllenNLP is awesome ."
        tokens = [Token(token) for token in sentence.split(" ")]
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("l", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("N", namespace="characters")  # 6
        vocab.add_token_to_namespace("L", namespace="characters")  # 7
        vocab.add_token_to_namespace("P", namespace="characters")  # 8
        vocab.add_token_to_namespace("i", namespace="characters")  # 9
        vocab.add_token_to_namespace("s", namespace="characters")  # 10
        vocab.add_token_to_namespace("a", namespace="characters")  # 11
        vocab.add_token_to_namespace("w", namespace="characters")  # 12
        vocab.add_token_to_namespace("o", namespace="characters")  # 13
        vocab.add_token_to_namespace("m", namespace="characters")  # 14
        vocab.add_token_to_namespace(".", namespace="characters")  # 15

        indexer = TokenCharactersIndexer("characters", min_padding_length=10)
        indices = indexer.tokens_to_indices(tokens, vocab, "char")
        key_padding_lengths = "num_token_characters"
        value_padding_lengths = 0
        for token in indices["char"]:
            item = indexer.get_padding_lengths(token)
            value = item.values()
            value_padding_lengths = max(value_padding_lengths, max(value))
        padded = indexer.pad_token_sequence(indices,
                                            {"char": len(indices["char"])},
                                            {key_padding_lengths: value_padding_lengths})
        assert padded == {"char": [[2, 3, 3, 4, 5, 6, 7, 8, 0, 0],
                                   [9, 10, 0, 0, 0, 0, 0, 0, 0, 0],
                                   [11, 12, 4, 10, 13, 14, 4, 0, 0, 0],
                                   [15, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}
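A hedged sketch (same pre-1.0 AllenNLP calls as the test above, rebuilt only for illustration): without min_padding_length the padded width collapses to the longest token, which is why the test forces it to 10.

# Hedged sketch: same pre-1.0 AllenNLP API as the test above; vocabulary and tokens
# are rebuilt here only to show the default behaviour.
from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import TokenCharactersIndexer

vocab = Vocabulary()
for char in set("AllenNLP is awesome .".replace(" ", "")):
    vocab.add_token_to_namespace(char, namespace="characters")

tokens = [Token(t) for t in "AllenNLP is awesome .".split(" ")]
indexer = TokenCharactersIndexer("characters")  # no min_padding_length
indices = indexer.tokens_to_indices(tokens, vocab, "char")
lengths = [indexer.get_padding_lengths(token) for token in indices["char"]]
print(max(length["num_token_characters"] for length in lengths))  # 8 ("AllenNLP")
# With min_padding_length=10, as in the test above, every row pads out to width 10.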
 def test_as_array_produces_token_sequence(self):
     indexer = TokenCharactersIndexer("characters")
     padded_tokens = indexer.pad_token_sequence([[1, 2, 3, 4, 5], [1, 2, 3], [1]],
                                                desired_num_tokens=4,
                                                padding_lengths={"num_token_characters": 10})
     assert padded_tokens == [[1, 2, 3, 4, 5, 0, 0, 0, 0, 0],
                              [1, 2, 3, 0, 0, 0, 0, 0, 0, 0],
                              [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                              [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    def test_token_to_indices_produces_correct_characters(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace='characters')
        vocab.add_token_to_namespace("s", namespace='characters')
        vocab.add_token_to_namespace("e", namespace='characters')
        vocab.add_token_to_namespace("n", namespace='characters')
        vocab.add_token_to_namespace("t", namespace='characters')
        vocab.add_token_to_namespace("c", namespace='characters')

        indexer = TokenCharactersIndexer("characters")
        indices = indexer.token_to_indices(Token("sentential"), vocab)
        assert indices == [3, 4, 5, 6, 4, 5, 6, 1, 1, 1]
Example #4
 def test_token_padding_lengths_are_computed_correctly(self):
     field = TextField(
         [Token(t) for t in ["A", "sentence"]],
         token_indexers={
             "field_with_dict":
             DictReturningTokenIndexer(token_min_padding_length=3),
             "words":
             SingleIdTokenIndexer("words", token_min_padding_length=3),
             "characters":
             TokenCharactersIndexer("characters",
                                    min_padding_length=1,
                                    token_min_padding_length=3)
         })
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     assert padding_lengths == {
         'token_ids_length': 5,
         'additional_key_length': 3,
         'words_length': 3,
         'characters_length': 3,
         'num_token_characters': 8,
         'num_tokens': 5,
     }
     tensors = field.as_tensor(padding_lengths)
     assert tensors['additional_key'].tolist()[-1] == 0
     assert tensors['words'].tolist()[-1] == 0
     assert tensors['characters'].tolist()[-1] == [0] * 8
Example #5
 def test_token_indexer_returns_dict(self):
     field = TextField(
         [Token(t) for t in ["A", "sentence"]],
         token_indexers={
             "field_with_dict":
             DictReturningTokenIndexer(),
             "words":
             SingleIdTokenIndexer("words"),
             "characters":
             TokenCharactersIndexer("characters", min_padding_length=1)
         })
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     assert padding_lengths == {
         'token_ids_length': 5,
         'additional_key_length': 2,
         'words_length': 2,
         'characters_length': 2,
         'num_token_characters': 8,
         'num_tokens': 5,
     }
     padding_lengths['token_ids_length'] = 7
     padding_lengths['additional_key_length'] = 3
     padding_lengths['words_length'] = 4
     padding_lengths['characters_length'] = 4
     tensors = field.as_tensor(padding_lengths)
     assert list(tensors['token_ids'].shape) == [7]
     assert list(tensors['additional_key'].shape) == [3]
     assert list(tensors['words'].shape) == [4]
     assert list(tensors['characters'].shape) == [4, 8]
Example #6
    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["a", "sentence", "."]],
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 5
        padding_lengths["characters_length"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
def main():
    dataset_reader = CopyNetDatasetReader(
        target_namespace='target_tokens',
        source_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='source_tokens'),
            'token_characters': TokenCharactersIndexer()
        })

    model_archive = load_archive(
        archive_file='checkpoints/model.tar.gz',
        cuda_device=-1,
        weights_file='checkpoints/model_state_epoch_28.th')

    model = model_archive.model
    model.eval()

    predictor = Seq2SeqPredictor(model=model, dataset_reader=dataset_reader)

    with open('snips/val.tsv') as val_file:
        for line in val_file:
            source, target = line.strip().split('\t')
            print('Gold Target: {}'.format(
                target.replace('OPEN', '(').replace('CLOSE', ')')))
            # Predict from the source side; the gold target is only printed for comparison.
            predicted_tokens = predictor.predict(source)['predicted_tokens'][0]
            print('Predictions: {}\n'.format(
                ' '.join(predicted_tokens).replace('OPEN', '(').replace('CLOSE', ')')))
Example #8
 def test_token_padding_lengths_are_computed_correctly(self):
     field = TextField(
         [Token(t) for t in ["A", "sentence"]],
         token_indexers={
             "field_with_dict":
             DictReturningTokenIndexer(token_min_padding_length=3),
             "words":
             SingleIdTokenIndexer("words", token_min_padding_length=3),
             "characters":
             TokenCharactersIndexer("characters",
                                    min_padding_length=1,
                                    token_min_padding_length=3),
         },
     )
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     assert padding_lengths == {
         "field_with_dict___token_ids": 5,
         "field_with_dict___additional_key": 3,
         "words___tokens": 3,
         "characters___token_characters": 3,
         "characters___num_token_characters": 8,
     }
     tensors = field.as_tensor(padding_lengths)
     assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
     assert tensors["words"]["tokens"].tolist()[-1] == 0
     assert tensors["characters"]["token_characters"].tolist()[-1] == [0
                                                                       ] * 8
Example #9
def build_indexers(args):
    indexers = {}
    if args.word_embs != 'none':
        indexers["words"] = SingleIdTokenIndexer()
    if args.elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", \
            (f"CoVe model expects Moses tokenization (MosesTokenizer);"
             " you are using args.tokenizer = {args.tokenizer}")
    if args.openai_transformer:
        assert not indexers, ("OpenAI transformer is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == "OpenAI.BPE", \
            ("OpenAI transformer is not supported alongside"
             " other indexers due to tokenization!")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer(
            "openai_bpe")
    if args.bert_model_name:
        assert not indexers, ("BERT is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == args.bert_model_name, \
            ("BERT models use custom WPM tokenization for "
             "each model, so tokenizer must match the "
             "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(
            args.bert_model_name)
    return indexers
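A hedged usage sketch of how an indexers dict like the one returned above is consumed; it uses the pre-1.0 TextField API that appears elsewhere on this page, and the literal two-indexer dict stands in for what build_indexers returns for a word + character configuration.

# Hypothetical usage sketch (pre-1.0 AllenNLP API); the dict below stands in for what
# build_indexers returns when word and character embeddings are both enabled.
from allennlp.data import Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer

indexers = {"words": SingleIdTokenIndexer(), "chars": TokenCharactersIndexer("chars")}
field = TextField([Token(t) for t in "A short sentence .".split()],
                  token_indexers=indexers)
# After field.index(vocab), each indexer contributes its own index array and its own
# padding keys, so "words" and "chars" are padded independently.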
Example #10
 def test_token_indexer_returns_dict(self):
     field = TextField(
         [Token(t) for t in ["A", "sentence"]],
         token_indexers={
             "field_with_dict":
             DictReturningTokenIndexer(),
             "words":
             SingleIdTokenIndexer("words"),
             "characters":
             TokenCharactersIndexer("characters", min_padding_length=1),
         },
     )
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     assert padding_lengths == {
         "field_with_dict___token_ids": 5,
         "field_with_dict___additional_key": 2,
         "words___tokens": 2,
         "characters___token_characters": 2,
         "characters___num_token_characters": 8,
     }
     padding_lengths["field_with_dict___token_ids"] = 7
     padding_lengths["field_with_dict___additional_key"] = 3
     padding_lengths["words___tokens"] = 4
     padding_lengths["characters___token_characters"] = 4
     tensors = field.as_tensor(padding_lengths)
     assert list(tensors["field_with_dict"]["token_ids"].shape) == [7]
     assert list(tensors["field_with_dict"]["additional_key"].shape) == [3]
     assert list(tensors["words"]["tokens"].shape) == [4]
     assert list(tensors["characters"]["token_characters"].shape) == [4, 8]
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace(u"this", u"words")
        self.vocab.add_token_to_namespace(u"is", u"words")
        self.vocab.add_token_to_namespace(u"a", u"words")
        self.vocab.add_token_to_namespace(u"sentence", u'words')
        self.vocab.add_token_to_namespace(u"s", u'characters')
        self.vocab.add_token_to_namespace(u"e", u'characters')
        self.vocab.add_token_to_namespace(u"n", u'characters')
        self.vocab.add_token_to_namespace(u"t", u'characters')
        self.vocab.add_token_to_namespace(u"c", u'characters')
        for label in [u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', u'k']:
            self.vocab.add_token_to_namespace(label, u'labels')

        self.word_indexer = {u"words": SingleIdTokenIndexer(u"words")}
        self.words_and_characters_indexers = {u"words": SingleIdTokenIndexer(u"words"),
                                              u"characters": TokenCharactersIndexer(u"characters")}
        self.field1 = TextField([Token(t) for t in [u"this", u"is", u"a", u"sentence"]],
                                self.word_indexer)
        self.field2 = TextField([Token(t) for t in [u"this", u"is", u"a", u"different", u"sentence"]],
                                self.word_indexer)
        self.field3 = TextField([Token(t) for t in [u"this", u"is", u"another", u"sentence"]],
                                self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()
Example #12
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexer = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters")
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)
        super(TestListField, self).setUp()
Example #13
    def test_with_token_characters_indexer(self):

        inputs = {"sentence": "I always write unit tests for my code."}

        archive = load_archive(self.FIXTURES_ROOT / "basic_classifier" /
                               "serialization" / "model.tar.gz")
        predictor = Predictor.from_archive(archive)
        predictor._dataset_reader._token_indexers[
            "chars"] = TokenCharactersIndexer(min_padding_length=1)
        predictor._model._text_field_embedder._token_embedders[
            "chars"] = EmptyEmbedder()

        hotflipper = Hotflip(predictor)
        hotflipper.initialize()
        attack = hotflipper.attack_from_json(inputs, "tokens", "grad_input_1")
        assert attack is not None
        assert "final" in attack
        assert "original" in attack
        assert "outputs" in attack
        assert len(attack["final"][0]) == len(
            attack["original"])  # hotflip replaces words without removing

        # This checks for a bug that arose with a change in the pytorch API.  We want to be sure we
        # can handle the case where we have to re-encode a vocab item because we didn't save it in
        # our fake embedding matrix (see Hotflip docstring for more info).
        hotflipper = Hotflip(predictor, max_tokens=50)
        hotflipper.initialize()
        hotflipper._first_order_taylor(grad=torch.rand((10, )).numpy(),
                                       token_idx=torch.tensor(60),
                                       sign=1)
Example #14
def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}

    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            " you are using args.tokenizer = {args.tokenizer}")

    if input_module_uses_transformers(args.input_module):
        assert not indexers, (
            "transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization."
        )
        assert args.tokenizer == args.input_module, (
            "transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model.")
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
Example #15
    def from_params(cls, params: Params) -> "PnetOntoDatasetReader":
        # token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
        token_indexers = {
            "tokens": SingleIdTokenIndexer(lowercase_tokens=True),
            "token_characters": TokenCharactersIndexer(),
            "elmo": ELMoTokenCharactersIndexer(),
        }
        valid_class = params.pop("valid_class")
        random_seed = params.pop("random_seed")
        drop_empty = params.pop("drop_empty")
        valid_part = params.pop("valid_part")

        tag_label = params.pop("tag_label", None)
        feature_labels = params.pop("feature_labels", ())
        lazy = params.pop("lazy", False)
        params.assert_empty(cls.__name__)
        return PnetOntoDatasetReader(
            token_indexers=token_indexers,
            valid_class=valid_class,
            random_seed=random_seed,
            drop_empty=drop_empty,
            valid_part=valid_part,
            tag_label=tag_label,
            feature_labels=feature_labels,
            lazy=lazy,
        )
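A hedged sketch of the Params block the from_params above consumes; the keys mirror the params.pop calls, and the values (including the "PER" class name) are made up for illustration.

# Hypothetical Params for the from_params above; keys mirror the pops, values are
# illustrative only ("PER" is an assumed class name, not taken from the reader).
from allennlp.common import Params

params = Params({
    "valid_class": "PER",
    "random_seed": 13370,
    "drop_empty": True,
    "valid_part": 0.1,
})
reader = PnetOntoDatasetReader.from_params(params)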
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 name_token_indexers: Dict[str, TokenIndexer] = None,
                 token_only_indexer: Dict[str, TokenIndexer] = None) -> None:
        self._name_token_indexers = name_token_indexers or \
                                    {'tokens': SingleIdTokenIndexer(namespace="tokens"),
                                     'token_characters': TokenCharactersIndexer(namespace="token_characters")}
        self._token_only_indexer = token_only_indexer or \
                                   {'tokens': SingleIdTokenIndexer(namespace="tokens")}
        self._tokenizer = tokenizer or WordTokenizer()

        self._empty_token_text_field = TextField(
            self._tokenizer.tokenize('00000'), self._token_only_indexer)
        self._empty_list_token_text_field = ListField([
            TextField(self._tokenizer.tokenize('00000'),
                      self._token_only_indexer)
        ])

        self.PARENT_REL_LABELS = constants.UMLS_PARENT_REL_LABELS
        self.CHILD_REL_LABELS = constants.UMLS_CHILD_REL_LABELS

        self.STOP = set(stopwords.words('english'))
        self.tokenizer = RegexpTokenizer(r'[A-Za-z\d]+')
        self.stemmer = SnowballStemmer("english")
        self.lemmatizer = WordNetLemmatizer()

        self.nlp = spacy.load('en')
Example #17
def build_indexers(args):
    indexers = {}
    if not args.input_module.startswith("bert") and args.input_module not in [
            "elmo", "gpt"
    ]:
        indexers["words"] = SingleIdTokenIndexer()
    if args.input_module == "elmo":
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            f"CoVe model expects Moses tokenization (MosesTokenizer);"
            " you are using args.tokenizer = {args.tokenizer}")
    if args.input_module == "gpt":
        assert (
            not indexers
        ), "OpenAI transformer is not supported alongside other indexers due to tokenization."
        assert (
            args.tokenizer == "OpenAI.BPE"
        ), "OpenAI transformer uses custom BPE tokenization. Set tokenizer=OpenAI.BPE."
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer(
            "openai_bpe")
    if args.input_module.startswith("bert"):
        assert not indexers, "BERT is not supported alongside other indexers due to tokenization."
        assert args.tokenizer == args.input_module, (
            "BERT models use custom WPM tokenization for "
            "each model, so tokenizer must match the "
            "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(
            args.input_module)
    return indexers
    def test_start_and_end_tokens(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace='characters')  # 2
        vocab.add_token_to_namespace("s", namespace='characters')  # 3
        vocab.add_token_to_namespace("e", namespace='characters')  # 4
        vocab.add_token_to_namespace("n", namespace='characters')  # 5
        vocab.add_token_to_namespace("t", namespace='characters')  # 6
        vocab.add_token_to_namespace("c", namespace='characters')  # 7
        vocab.add_token_to_namespace("<", namespace='characters')  # 8
        vocab.add_token_to_namespace(">", namespace='characters')  # 9
        vocab.add_token_to_namespace("/", namespace='characters')  # 10

        indexer = TokenCharactersIndexer("characters", start_tokens=["<s>"], end_tokens=["</s>"])
        indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
        assert indices == {"char": [[8, 3, 9],
                                    [3, 4, 5, 6, 4, 5, 6, 1, 1, 1],
                                    [8, 10, 3, 9]]}
Example #19
 def test_as_tensor_handles_characters_if_empty_field(self):
     field = TextField([], token_indexers={"characters": TokenCharactersIndexer("characters",
                                                                                min_padding_length=1)})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     tensor_dict = field.as_tensor(padding_lengths)
     expected_character_array = numpy.array([])
     numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                             expected_character_array)
Example #20
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or CharacterTokenizer()
     self._token_indexers = token_indexers or {
         'tokens': TokenCharactersIndexer()
     }
Example #21
    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters"),
                                          "words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}
    def test_count_vocab_items_respects_casing(self):
        indexer = TokenCharactersIndexer("characters")
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["characters"] == {"h": 1, "H": 1, "e": 2, "l": 4, "o": 2}

        indexer = TokenCharactersIndexer("characters", CharacterTokenizer(lowercase_characters=True))
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)
        assert counter["characters"] == {"h": 2, "e": 2, "l": 4, "o": 2}
 def test_padding_lengths_are_computed_correctly(self):
     # pylint: disable=protected-access
     self.field.index(self.vocab)
     assert self.field.get_padding_lengths() == {'num_entities': 9, 'num_entity_tokens': 3,
                                                 'num_utterance_tokens': 4}
     self.field._token_indexers['token_characters'] = TokenCharactersIndexer()
     self.field.index(self.vocab)
     assert self.field.get_padding_lengths() == {'num_entities': 9, 'num_entity_tokens': 3,
                                                 'num_utterance_tokens': 4,
                                                 'num_token_characters': 9}
Example #24
    def test_start_and_end_tokens(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace='characters')  # 2
        vocab.add_token_to_namespace("s", namespace='characters')  # 3
        vocab.add_token_to_namespace("e", namespace='characters')  # 4
        vocab.add_token_to_namespace("n", namespace='characters')  # 5
        vocab.add_token_to_namespace("t", namespace='characters')  # 6
        vocab.add_token_to_namespace("c", namespace='characters')  # 7
        vocab.add_token_to_namespace("<", namespace='characters')  # 8
        vocab.add_token_to_namespace(">", namespace='characters')  # 9
        vocab.add_token_to_namespace("/", namespace='characters')  # 10

        indexer = TokenCharactersIndexer("characters",
                                         start_tokens=["<s>"],
                                         end_tokens=["</s>"])
        indices = indexer.tokens_to_indices([Token("sentential")], vocab,
                                            "char")
        assert indices == {
            "char": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]
        }
 def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy=False)
     self.tokenizer_space = WhitespaceTokenizer()
      self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                            pos_tags=True, split_on_spaces=True)
      self.token_indexers = {
          'elmo_tokens': ELMoTokenCharactersIndexer(),
          'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                     min_padding_length=2),
          'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab', default_value='NNP',
                                           feature_name='tag_')
      }

      self.intent_indexers = {
          'elmo_tokens': ELMoTokenCharactersIndexer(),
          'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                     min_padding_length=2),
          'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab', default_value='NNP',
                                           feature_name='tag_')
      }
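A hedged sketch (AllenNLP 1.x-style API, consistent with the WhitespaceTokenizer/SpacyTokenizer names above) of what feature_name='tag_' changes: the indexer looks up token.tag_ instead of token.text, with default_value filling in when the tag is missing.

# Hedged sketch: with feature_name="tag_", the indexer reads the token's POS tag.
from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import SingleIdTokenIndexer

vocab = Vocabulary()
vocab.add_token_to_namespace("NN", namespace="pos_tag_vocab")
indexer = SingleIdTokenIndexer(namespace="pos_tag_vocab",
                               feature_name="tag_", default_value="NNP")
print(indexer.tokens_to_indices([Token(text="dog", tag_="NN")], vocab))  # {'tokens': [2]}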
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace="words")
        capital_a_index = vocab.add_token_to_namespace("A", namespace="words")
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace="characters")
        s_index = vocab.add_token_to_namespace("s", namespace="characters")
        e_index = vocab.add_token_to_namespace("e", namespace="characters")
        n_index = vocab.add_token_to_namespace("n", namespace="characters")
        t_index = vocab.add_token_to_namespace("t", namespace="characters")
        c_index = vocab.add_token_to_namespace("c", namespace="characters")

        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"words": SingleIdTokenIndexer(namespace="words")},
        )
        field.index(vocab)

        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1)},
        )
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [
            [capital_a_char_index],
            [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
        ]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words": SingleIdTokenIndexer(namespace="words"),
                "characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1),
            },
        )
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [
            [capital_a_char_index],
            [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
        ]
Example #27
 def test_as_array_handles_characters(self):
     field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                       token_indexers={"characters": TokenCharactersIndexer("characters")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     array_dict = field.as_array(padding_lengths)
     expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                             [1, 3, 0, 0, 0, 0, 0, 0],
                                             [1, 0, 0, 0, 0, 0, 0, 0],
                                             [3, 4, 5, 6, 4, 5, 7, 4],
                                             [1, 0, 0, 0, 0, 0, 0, 0]])
     numpy.testing.assert_array_almost_equal(array_dict["characters"], expected_character_array)
Example #28
    def __init__(self, max_word_length=None):
        super().__init__(lazy=False)
        self.source_tokenizer = WordTokenizer(
            SpacyWordSplitter("es_core_news_sm"))
        self.target_tokenizer = WordTokenizer(
            SpacyWordSplitter("en_core_web_sm"),
            start_tokens=["BOS"],
            end_tokens=["EOS"],
        )

        self.source_token_indexers = {
            "token_characters":
            TokenCharactersIndexer(
                "char_src",
                min_padding_length=5,
                character_tokenizer=MyCharacterTokenizer(
                    max_length=max_word_length, ),
            ),
            "tokens":
            SingleIdTokenIndexer("token_src"),
        }
        self.target_token_indexers = {
            "token_characters":
            TokenCharactersIndexer(
                "char_trg",
                character_tokenizer=MyCharacterTokenizer(
                    max_length=max_word_length, ),
            ),
            "token_characters_output":
            TokenCharactersIndexer(
                "char_trg",
                character_tokenizer=MyCharacterTokenizer(
                    max_length=max_word_length,
                    start_tokens=["BOT"],
                    end_tokens=["EOT"]  # lul
                ),
            ),
            "tokens":
            SingleIdTokenIndexer("token_trg"),
        }
Example #29
 def test_as_tensor_handles_characters(self):
     field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                       token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
     field.index(self.vocab)
     padding_lengths = field.get_padding_lengths()
     tensor_dict = field.as_tensor(padding_lengths)
     expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                             [1, 3, 0, 0, 0, 0, 0, 0],
                                             [1, 0, 0, 0, 0, 0, 0, 0],
                                             [3, 4, 5, 6, 4, 5, 7, 4],
                                             [1, 0, 0, 0, 0, 0, 0, 0]])
     numpy.testing.assert_array_almost_equal(tensor_dict[u"characters"].detach().cpu().numpy(),
                                             expected_character_array)
 def __init__(self,
              window_size: int = 4,
              min_padding_length: int = 4,
              subsampling_threshold: float = 10e-5,
              lazy: bool = False) -> None:
     super().__init__(lazy=lazy)
     self._window_size = window_size
     self._subsampling_threshold = subsampling_threshold
     self._word_indexers = {'words': SingleIdTokenIndexer(namespace='words')}
     self._syllable_indexers = {
         'syllables': TokenCharactersIndexer(
             namespace='syllables', min_padding_length=min_padding_length)}
     self._word_sample_prob = None
Example #31
def construct_reader(is_pretrain):
    character_tokenizer = CharacterTokenizer(byte_encoding="utf-8",
                                             start_tokens=[259],
                                             end_tokens=[260])
    token_character_indexer = TokenCharactersIndexer(character_tokenizer=character_tokenizer,
                                                     min_padding_length=5)
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = FollowUpDataReader(token_indexer={
        "token_words": token_indexer
    }, char_indexer={
        "token_characters": token_character_indexer,
    }, is_pretrain=is_pretrain)
    return reader
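A hedged sketch (pre-1.0 CharacterTokenizer API, as used above) of why 259 and 260 are safe start/end ids: byte encoding shifts each UTF-8 byte by one into 1..256, keeping 0 for padding, so ids above 256 never collide with real characters.

# Hedged sketch: byte ids are shifted by one (0 is reserved for padding), so the
# start/end ids 259 and 260 cannot collide with real character bytes.
from allennlp.data.tokenizers import CharacterTokenizer

tokenizer = CharacterTokenizer(byte_encoding="utf-8", start_tokens=[259], end_tokens=[260])
print([token.text_id for token in tokenizer.tokenize("hi")])  # [259, 105, 106, 260]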
Example #32
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", "words")
        self.vocab.add_token_to_namespace("s", "characters")
        self.vocab.add_token_to_namespace("e", "characters")
        self.vocab.add_token_to_namespace("n", "characters")
        self.vocab.add_token_to_namespace("t", "characters")
        self.vocab.add_token_to_namespace("c", "characters")
        for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
            self.vocab.add_token_to_namespace(label, "labels")

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words":
            SingleIdTokenIndexer("words"),
            "characters":
            TokenCharactersIndexer("characters", min_padding_length=1),
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1],
                                                       self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        empty_list_field = ListField([text_field.empty_field()])
        empty_fields = {"list_tensor": empty_list_field}
        self.empty_instance = Instance(empty_fields)

        non_empty_list_field = ListField([text_field])
        non_empty_fields = {"list_tensor": non_empty_list_field}
        self.non_empty_instance = Instance(non_empty_fields)

        super().setUp()
Example #33
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words":
            SingleIdTokenIndexer("words"),
            "characters":
            TokenCharactersIndexer("characters", min_padding_length=1)
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1],
                                                       self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        tokenizer = WordTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        empty_list_field = ListField([text_field.empty_field()])
        empty_fields = {'list_tensor': empty_list_field}
        self.empty_instance = Instance(empty_fields)

        non_empty_list_field = ListField([text_field])
        non_empty_fields = {'list_tensor': non_empty_list_field}
        self.non_empty_instance = Instance(non_empty_fields)

        super(TestListField, self).setUp()
Example #34
 def __init__(self,
              token_indexers=None,
              sentence_field_name='sentence',
              tags_field_name='tags',
              tag_namespace='tags'):
     if token_indexers is None:
         token_indexers = {
             'words': SingleIdTokenIndexer(namespace='tokens'),
             'chars': TokenCharactersIndexer(namespace='token_chars'),
         }
     self.token_indexers = token_indexers
     self.sentence_field_name = sentence_field_name
     self.tags_field_name = tags_field_name
     self.tag_namespace = tag_namespace