def test_min_padding_length(self):
    sentence = "AllenNLP is awesome ."
    tokens = [Token(token) for token in sentence.split(" ")]
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace="characters")  # 2
    vocab.add_token_to_namespace("l", namespace="characters")  # 3
    vocab.add_token_to_namespace("e", namespace="characters")  # 4
    vocab.add_token_to_namespace("n", namespace="characters")  # 5
    vocab.add_token_to_namespace("N", namespace="characters")  # 6
    vocab.add_token_to_namespace("L", namespace="characters")  # 7
    vocab.add_token_to_namespace("P", namespace="characters")  # 8
    vocab.add_token_to_namespace("i", namespace="characters")  # 9
    vocab.add_token_to_namespace("s", namespace="characters")  # 10
    vocab.add_token_to_namespace("a", namespace="characters")  # 11
    vocab.add_token_to_namespace("w", namespace="characters")  # 12
    vocab.add_token_to_namespace("o", namespace="characters")  # 13
    vocab.add_token_to_namespace("m", namespace="characters")  # 14
    vocab.add_token_to_namespace(".", namespace="characters")  # 15
    indexer = TokenCharactersIndexer("characters", min_padding_length=10)
    indices = indexer.tokens_to_indices(tokens, vocab, "char")
    key_padding_lengths = "num_token_characters"
    value_padding_lengths = 0
    for token in indices["char"]:
        item = indexer.get_padding_lengths(token)
        value = item.values()
        value_padding_lengths = max(value_padding_lengths, max(value))
    padded = indexer.pad_token_sequence(
        indices,
        {"char": len(indices["char"])},
        {key_padding_lengths: value_padding_lengths},
    )
    assert padded == {
        "char": [
            [2, 3, 3, 4, 5, 6, 7, 8, 0, 0],
            [9, 10, 0, 0, 0, 0, 0, 0, 0, 0],
            [11, 12, 4, 10, 13, 14, 4, 0, 0, 0],
            [15, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ]
    }
def test_as_array_produces_token_sequence(self):
    indexer = TokenCharactersIndexer("characters")
    padded_tokens = indexer.pad_token_sequence(
        [[1, 2, 3, 4, 5], [1, 2, 3], [1]],
        desired_num_tokens=4,
        padding_lengths={"num_token_characters": 10},
    )
    assert padded_tokens == [
        [1, 2, 3, 4, 5, 0, 0, 0, 0, 0],
        [1, 2, 3, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ]
def test_token_to_indices_produces_correct_characters(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace='characters')
    vocab.add_token_to_namespace("s", namespace='characters')
    vocab.add_token_to_namespace("e", namespace='characters')
    vocab.add_token_to_namespace("n", namespace='characters')
    vocab.add_token_to_namespace("t", namespace='characters')
    vocab.add_token_to_namespace("c", namespace='characters')
    indexer = TokenCharactersIndexer("characters")
    indices = indexer.token_to_indices(Token("sentential"), vocab)
    assert indices == [3, 4, 5, 6, 4, 5, 6, 1, 1, 1]
def test_token_padding_lengths_are_computed_correctly(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
            "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
            "characters": TokenCharactersIndexer(
                "characters", min_padding_length=1, token_min_padding_length=3
            ),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        'token_ids_length': 5,
        'additional_key_length': 3,
        'words_length': 3,
        'characters_length': 3,
        'num_token_characters': 8,
        'num_tokens': 5,
    }
    tensors = field.as_tensor(padding_lengths)
    assert tensors['additional_key'].tolist()[-1] == 0
    assert tensors['words'].tolist()[-1] == 0
    assert tensors['characters'].tolist()[-1] == [0] * 8
def test_token_indexer_returns_dict(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(),
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        'token_ids_length': 5,
        'additional_key_length': 2,
        'words_length': 2,
        'characters_length': 2,
        'num_token_characters': 8,
        'num_tokens': 5,
    }
    padding_lengths['token_ids_length'] = 7
    padding_lengths['additional_key_length'] = 3
    padding_lengths['words_length'] = 4
    padding_lengths['characters_length'] = 4
    tensors = field.as_tensor(padding_lengths)
    assert list(tensors['token_ids'].shape) == [7]
    assert list(tensors['additional_key'].shape) == [3]
    assert list(tensors['words'].shape) == [4]
    assert list(tensors['characters'].shape) == [4, 8]
def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
    field = TextField(
        [Token(t) for t in ["a", "sentence", "."]],
        token_indexers={
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    padding_lengths["words_length"] = 5
    padding_lengths["characters_length"] = 5
    padding_lengths["num_token_characters"] = 10
    tensor_dict = field.as_tensor(padding_lengths)
    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"].detach().cpu().numpy(), numpy.array([1, 2, 1, 0, 0])
    )
    numpy.testing.assert_array_almost_equal(
        tensor_dict["characters"].detach().cpu().numpy(),
        numpy.array(
            [
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            ]
        ),
    )
def main():
    dataset_reader = CopyNetDatasetReader(
        target_namespace='target_tokens',
        source_token_indexers={
            'tokens': SingleIdTokenIndexer(namespace='source_tokens'),
            'token_characters': TokenCharactersIndexer(),
        },
    )
    model_archive = load_archive(
        archive_file='checkpoints/model.tar.gz',
        cuda_device=-1,
        weights_file='checkpoints/model_state_epoch_28.th')
    model = model_archive.model
    model.eval()
    predictor = Seq2SeqPredictor(model=model, dataset_reader=dataset_reader)
    with open('snips/val.tsv') as val_file:
        for line in val_file:
            source, target = line.strip().split('\t')
            print('Gold Target: {}'.format(
                target.replace('OPEN', '(').replace('CLOSE', ')')))
            # Predict from the source sentence, then compare against the gold target.
            predicted_tokens = predictor.predict(source)['predicted_tokens'][0]
            print('Predictions: {}'.format(' '.join(predicted_tokens)).replace(
                'OPEN', '(').replace('CLOSE', ')') + '\n')
def test_token_padding_lengths_are_computed_correctly(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
            "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
            "characters": TokenCharactersIndexer(
                "characters", min_padding_length=1, token_min_padding_length=3
            ),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "field_with_dict___token_ids": 5,
        "field_with_dict___additional_key": 3,
        "words___tokens": 3,
        "characters___token_characters": 3,
        "characters___num_token_characters": 8,
    }
    tensors = field.as_tensor(padding_lengths)
    assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
    assert tensors["words"]["tokens"].tolist()[-1] == 0
    assert tensors["characters"]["token_characters"].tolist()[-1] == [0] * 8
def build_indexers(args):
    indexers = {}
    if args.word_embs != 'none':
        indexers["words"] = SingleIdTokenIndexer()
    if args.elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            "CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}")
    if args.openai_transformer:
        assert not indexers, ("OpenAI transformer is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == "OpenAI.BPE", (
            "OpenAI transformer uses custom BPE tokenization;"
            " set tokenizer = OpenAI.BPE.")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer("openai_bpe")
    if args.bert_model_name:
        assert not indexers, ("BERT is not supported alongside"
                              " other indexers due to tokenization!")
        assert args.tokenizer == args.bert_model_name, (
            "BERT models use custom WPM tokenization for "
            "each model, so tokenizer must match the "
            "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(args.bert_model_name)
    return indexers
def test_token_indexer_returns_dict(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(),
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "field_with_dict___token_ids": 5,
        "field_with_dict___additional_key": 2,
        "words___tokens": 2,
        "characters___token_characters": 2,
        "characters___num_token_characters": 8,
    }
    padding_lengths["field_with_dict___token_ids"] = 7
    padding_lengths["field_with_dict___additional_key"] = 3
    padding_lengths["words___tokens"] = 4
    padding_lengths["characters___token_characters"] = 4
    tensors = field.as_tensor(padding_lengths)
    assert list(tensors["field_with_dict"]["token_ids"].shape) == [7]
    assert list(tensors["field_with_dict"]["additional_key"].shape) == [3]
    assert list(tensors["words"]["tokens"].shape) == [4]
    assert list(tensors["characters"]["token_characters"].shape) == [4, 8]
def setUp(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace(u"this", u"words")
    self.vocab.add_token_to_namespace(u"is", u"words")
    self.vocab.add_token_to_namespace(u"a", u"words")
    self.vocab.add_token_to_namespace(u"sentence", u"words")
    self.vocab.add_token_to_namespace(u"s", u"characters")
    self.vocab.add_token_to_namespace(u"e", u"characters")
    self.vocab.add_token_to_namespace(u"n", u"characters")
    self.vocab.add_token_to_namespace(u"t", u"characters")
    self.vocab.add_token_to_namespace(u"c", u"characters")
    for label in [u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', u'k']:
        self.vocab.add_token_to_namespace(label, u'labels')
    self.word_indexer = {u"words": SingleIdTokenIndexer(u"words")}
    self.words_and_characters_indexers = {
        u"words": SingleIdTokenIndexer(u"words"),
        u"characters": TokenCharactersIndexer(u"characters"),
    }
    self.field1 = TextField(
        [Token(t) for t in [u"this", u"is", u"a", u"sentence"]], self.word_indexer)
    self.field2 = TextField(
        [Token(t) for t in [u"this", u"is", u"a", u"different", u"sentence"]],
        self.word_indexer)
    self.field3 = TextField(
        [Token(t) for t in [u"this", u"is", u"another", u"sentence"]], self.word_indexer)
    self.empty_text_field = self.field1.empty_field()
    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()
    super(TestListField, self).setUp()
def setUp(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("this", "words")
    self.vocab.add_token_to_namespace("is", "words")
    self.vocab.add_token_to_namespace("a", "words")
    self.vocab.add_token_to_namespace("sentence", "words")
    self.vocab.add_token_to_namespace("s", "characters")
    self.vocab.add_token_to_namespace("e", "characters")
    self.vocab.add_token_to_namespace("n", "characters")
    self.vocab.add_token_to_namespace("t", "characters")
    self.vocab.add_token_to_namespace("c", "characters")
    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexer = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters"),
    }
    self.field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer)
    self.field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer)
    self.field3 = TextField(
        [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer)
    super(TestListField, self).setUp()
def test_with_token_characters_indexer(self):
    inputs = {"sentence": "I always write unit tests for my code."}
    archive = load_archive(
        self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz")
    predictor = Predictor.from_archive(archive)
    predictor._dataset_reader._token_indexers["chars"] = TokenCharactersIndexer(
        min_padding_length=1)
    predictor._model._text_field_embedder._token_embedders["chars"] = EmptyEmbedder()
    hotflipper = Hotflip(predictor)
    hotflipper.initialize()
    attack = hotflipper.attack_from_json(inputs, "tokens", "grad_input_1")
    assert attack is not None
    assert "final" in attack
    assert "original" in attack
    assert "outputs" in attack
    # Hotflip replaces words without removing any, so lengths should match.
    assert len(attack["final"][0]) == len(attack["original"])

    # This checks for a bug that arose with a change in the pytorch API. We want to be sure we
    # can handle the case where we have to re-encode a vocab item because we didn't save it in
    # our fake embedding matrix (see Hotflip docstring for more info).
    hotflipper = Hotflip(predictor, max_tokens=50)
    hotflipper.initialize()
    hotflipper._first_order_taylor(
        grad=torch.rand((10,)).numpy(), token_idx=torch.tensor(60), sign=1)
def build_indexers(args):
    indexers = {}
    if args.input_module in ["scratch", "glove", "fastText"]:
        indexers["words"] = SingleIdTokenIndexer()
    elif args.input_module in ["elmo", "elmo-chars-only"]:
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            "CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}")
    if input_module_uses_transformers(args.input_module):
        assert not indexers, (
            "transformers modules like BERT/XLNet are not supported alongside other "
            "indexers due to tokenization.")
        assert args.tokenizer == args.input_module, (
            "transformers models use custom tokenization for each model, so tokenizer "
            "must match the specified model.")
        tokenizer_name = input_module_tokenizer_name(args.input_module)
        indexers[tokenizer_name] = SingleIdTokenIndexer(tokenizer_name)
    return indexers
def from_params(cls, params: Params) -> "PnetOntoDatasetReader":
    # token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    token_indexers = {
        "tokens": SingleIdTokenIndexer(lowercase_tokens=True),
        "token_characters": TokenCharactersIndexer(),
        "elmo": ELMoTokenCharactersIndexer(),
    }
    valid_class = params.pop("valid_class")
    random_seed = params.pop("random_seed")
    drop_empty = params.pop("drop_empty")
    valid_part = params.pop("valid_part")
    tag_label = params.pop("tag_label", None)
    feature_labels = params.pop("feature_labels", ())
    lazy = params.pop("lazy", False)
    params.assert_empty(cls.__name__)
    return PnetOntoDatasetReader(
        token_indexers=token_indexers,
        valid_class=valid_class,
        random_seed=random_seed,
        drop_empty=drop_empty,
        valid_part=valid_part,
        tag_label=tag_label,
        feature_labels=feature_labels,
        lazy=lazy,
    )
def __init__(self,
             tokenizer: Tokenizer = None,
             name_token_indexers: Dict[str, TokenIndexer] = None,
             token_only_indexer: Dict[str, TokenIndexer] = None) -> None:
    self._name_token_indexers = name_token_indexers or {
        'tokens': SingleIdTokenIndexer(namespace="tokens"),
        'token_characters': TokenCharactersIndexer(namespace="token_characters"),
    }
    self._token_only_indexer = token_only_indexer or {
        'tokens': SingleIdTokenIndexer(namespace="tokens")
    }
    self._tokenizer = tokenizer or WordTokenizer()
    self._empty_token_text_field = TextField(
        self._tokenizer.tokenize('00000'), self._token_only_indexer)
    self._empty_list_token_text_field = ListField([
        TextField(self._tokenizer.tokenize('00000'), self._token_only_indexer)
    ])
    self.PARENT_REL_LABELS = constants.UMLS_PARENT_REL_LABELS
    self.CHILD_REL_LABELS = constants.UMLS_CHILD_REL_LABELS
    self.STOP = set(stopwords.words('english'))
    self.tokenizer = RegexpTokenizer(r'[A-Za-z\d]+')
    self.stemmer = SnowballStemmer("english")
    self.lemmatizer = WordNetLemmatizer()
    self.nlp = spacy.load('en')
def build_indexers(args):
    indexers = {}
    if not args.input_module.startswith("bert") and args.input_module not in ["elmo", "gpt"]:
        indexers["words"] = SingleIdTokenIndexer()
    if args.input_module == "elmo":
        indexers["elmo"] = ELMoTokenCharactersIndexer("elmo")
        assert args.tokenizer in {"", "MosesTokenizer"}
    if args.char_embs:
        indexers["chars"] = TokenCharactersIndexer("chars")
    if args.cove:
        assert args.tokenizer == "MosesTokenizer", (
            "CoVe model expects Moses tokenization (MosesTokenizer);"
            f" you are using args.tokenizer = {args.tokenizer}")
    if args.input_module == "gpt":
        assert not indexers, (
            "OpenAI transformer is not supported alongside other indexers due to tokenization.")
        assert args.tokenizer == "OpenAI.BPE", (
            "OpenAI transformer uses custom BPE tokenization. Set tokenizer=OpenAI.BPE.")
        indexers["openai_bpe_pretokenized"] = SingleIdTokenIndexer("openai_bpe")
    if args.input_module.startswith("bert"):
        assert not indexers, (
            "BERT is not supported alongside other indexers due to tokenization.")
        assert args.tokenizer == args.input_module, (
            "BERT models use custom WPM tokenization for "
            "each model, so tokenizer must match the "
            "specified BERT model.")
        indexers["bert_wpm_pretokenized"] = SingleIdTokenIndexer(args.input_module)
    return indexers
def test_start_and_end_tokens(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace='characters')  # 2
    vocab.add_token_to_namespace("s", namespace='characters')  # 3
    vocab.add_token_to_namespace("e", namespace='characters')  # 4
    vocab.add_token_to_namespace("n", namespace='characters')  # 5
    vocab.add_token_to_namespace("t", namespace='characters')  # 6
    vocab.add_token_to_namespace("c", namespace='characters')  # 7
    vocab.add_token_to_namespace("<", namespace='characters')  # 8
    vocab.add_token_to_namespace(">", namespace='characters')  # 9
    vocab.add_token_to_namespace("/", namespace='characters')  # 10
    indexer = TokenCharactersIndexer("characters", start_tokens=["<s>"], end_tokens=["</s>"])
    indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
    assert indices == {"char": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]}
def test_as_tensor_handles_characters_if_empty_field(self):
    field = TextField(
        [],
        token_indexers={
            "characters": TokenCharactersIndexer("characters", min_padding_length=1)
        })
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    expected_character_array = numpy.array([])
    numpy.testing.assert_array_almost_equal(
        tensor_dict["characters"].detach().cpu().numpy(), expected_character_array)
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or CharacterTokenizer()
    self._token_indexers = token_indexers or {'tokens': TokenCharactersIndexer()}
def test_padding_lengths_are_computed_correctly(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {"num_tokens": 5}

    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"characters": TokenCharactersIndexer("characters")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"characters": TokenCharactersIndexer("characters"),
                        "words": SingleIdTokenIndexer("words")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}
def test_count_vocab_items_respects_casing(self):
    indexer = TokenCharactersIndexer("characters")
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["characters"] == {"h": 1, "H": 1, "e": 2, "l": 4, "o": 2}

    indexer = TokenCharactersIndexer("characters",
                                     CharacterTokenizer(lowercase_characters=True))
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["characters"] == {"h": 2, "e": 2, "l": 4, "o": 2}
def test_padding_lengths_are_computed_correctly(self):
    # pylint: disable=protected-access
    self.field.index(self.vocab)
    assert self.field.get_padding_lengths() == {'num_entities': 9,
                                                'num_entity_tokens': 3,
                                                'num_utterance_tokens': 4}
    self.field._token_indexers['token_characters'] = TokenCharactersIndexer()
    self.field.index(self.vocab)
    assert self.field.get_padding_lengths() == {'num_entities': 9,
                                                'num_entity_tokens': 3,
                                                'num_utterance_tokens': 4,
                                                'num_token_characters': 9}
def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self.tokenizer_space = WhitespaceTokenizer()
    self.tokenizer_spacy = SpacyTokenizer(
        language="en_core_web_md", pos_tags=True, split_on_spaces=True)
    self.token_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                   min_padding_length=2),
        'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                         default_value='NNP',
                                         feature_name='tag_'),
    }
    self.intent_indexers = {
        'elmo_tokens': ELMoTokenCharactersIndexer(),
        'token_characters': TokenCharactersIndexer(namespace='character_vocab',
                                                   min_padding_length=2),
        'pos_tags': SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                         default_value='NNP',
                                         feature_name='tag_'),
    }
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace="words")
    capital_a_index = vocab.add_token_to_namespace("A", namespace="words")
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace="characters")
    s_index = vocab.add_token_to_namespace("s", namespace="characters")
    e_index = vocab.add_token_to_namespace("e", namespace="characters")
    n_index = vocab.add_token_to_namespace("n", namespace="characters")
    t_index = vocab.add_token_to_namespace("t", namespace="characters")
    c_index = vocab.add_token_to_namespace("c", namespace="characters")

    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        {"words": SingleIdTokenIndexer(namespace="words")},
    )
    field.index(vocab)
    assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

    field1 = TextField(
        [Token(t) for t in ["A", "sentence"]],
        {"characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1)},
    )
    field1.index(vocab)
    assert field1._indexed_tokens["characters"] == [
        [capital_a_char_index],
        [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
    ]

    field2 = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "words": SingleIdTokenIndexer(namespace="words"),
            "characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1),
        },
    )
    field2.index(vocab)
    assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
    assert field2._indexed_tokens["characters"] == [
        [capital_a_char_index],
        [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
    ]
def test_as_array_handles_characters(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"characters": TokenCharactersIndexer("characters")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    array_dict = field.as_array(padding_lengths)
    expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                            [1, 3, 0, 0, 0, 0, 0, 0],
                                            [1, 0, 0, 0, 0, 0, 0, 0],
                                            [3, 4, 5, 6, 4, 5, 7, 4],
                                            [1, 0, 0, 0, 0, 0, 0, 0]])
    numpy.testing.assert_array_almost_equal(array_dict["characters"], expected_character_array)
def __init__(self, max_word_length=None):
    super().__init__(lazy=False)
    self.source_tokenizer = WordTokenizer(SpacyWordSplitter("es_core_news_sm"))
    self.target_tokenizer = WordTokenizer(
        SpacyWordSplitter("en_core_web_sm"),
        start_tokens=["BOS"],
        end_tokens=["EOS"],
    )
    self.source_token_indexers = {
        "token_characters": TokenCharactersIndexer(
            "char_src",
            min_padding_length=5,
            character_tokenizer=MyCharacterTokenizer(max_length=max_word_length),
        ),
        "tokens": SingleIdTokenIndexer("token_src"),
    }
    self.target_token_indexers = {
        "token_characters": TokenCharactersIndexer(
            "char_trg",
            character_tokenizer=MyCharacterTokenizer(max_length=max_word_length),
        ),
        "token_characters_output": TokenCharactersIndexer(
            "char_trg",
            character_tokenizer=MyCharacterTokenizer(
                max_length=max_word_length,
                start_tokens=["BOT"],
                end_tokens=["EOT"],
            ),
        ),
        "tokens": SingleIdTokenIndexer("token_trg"),
    }
def test_as_tensor_handles_characters(self):
    field = TextField(
        [Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
        token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                            [1, 3, 0, 0, 0, 0, 0, 0],
                                            [1, 0, 0, 0, 0, 0, 0, 0],
                                            [3, 4, 5, 6, 4, 5, 7, 4],
                                            [1, 0, 0, 0, 0, 0, 0, 0]])
    numpy.testing.assert_array_almost_equal(
        tensor_dict[u"characters"].detach().cpu().numpy(), expected_character_array)
def __init__(self,
             window_size: int = 4,
             min_padding_length: int = 4,
             subsampling_threshold: float = 10e-5,
             lazy: bool = False) -> None:
    super().__init__(lazy=lazy)
    self._window_size = window_size
    self._subsampling_threshold = subsampling_threshold
    self._word_indexers = {'words': SingleIdTokenIndexer(namespace='words')}
    self._syllable_indexers = {
        'syllables': TokenCharactersIndexer(
            namespace='syllables', min_padding_length=min_padding_length)
    }
    self._word_sample_prob = None
def construct_reader(is_pretrain):
    character_tokenizer = CharacterTokenizer(
        byte_encoding="utf-8", start_tokens=[259], end_tokens=[260])
    token_character_indexer = TokenCharactersIndexer(
        character_tokenizer=character_tokenizer, min_padding_length=5)
    token_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = FollowUpDataReader(
        token_indexer={"token_words": token_indexer},
        char_indexer={"token_characters": token_character_indexer},
        is_pretrain=is_pretrain)
    return reader
def setUp(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("this", "words")
    self.vocab.add_token_to_namespace("is", "words")
    self.vocab.add_token_to_namespace("a", "words")
    self.vocab.add_token_to_namespace("sentence", "words")
    self.vocab.add_token_to_namespace("s", "characters")
    self.vocab.add_token_to_namespace("e", "characters")
    self.vocab.add_token_to_namespace("n", "characters")
    self.vocab.add_token_to_namespace("t", "characters")
    self.vocab.add_token_to_namespace("c", "characters")
    for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
        self.vocab.add_token_to_namespace(label, "labels")

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters", min_padding_length=1),
    }
    self.field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer)
    self.field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer)
    self.field3 = TextField(
        [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer)

    self.empty_text_field = self.field1.empty_field()
    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    tokenizer = SpacyTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    empty_list_field = ListField([text_field.empty_field()])
    empty_fields = {"list_tensor": empty_list_field}
    self.empty_instance = Instance(empty_fields)

    non_empty_list_field = ListField([text_field])
    non_empty_fields = {"list_tensor": non_empty_list_field}
    self.non_empty_instance = Instance(non_empty_fields)

    super().setUp()
def setUp(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("this", "words")
    self.vocab.add_token_to_namespace("is", "words")
    self.vocab.add_token_to_namespace("a", "words")
    self.vocab.add_token_to_namespace("sentence", "words")
    self.vocab.add_token_to_namespace("s", "characters")
    self.vocab.add_token_to_namespace("e", "characters")
    self.vocab.add_token_to_namespace("n", "characters")
    self.vocab.add_token_to_namespace("t", "characters")
    self.vocab.add_token_to_namespace("c", "characters")
    for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
        self.vocab.add_token_to_namespace(label, 'labels')

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters", min_padding_length=1),
    }
    self.field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer)
    self.field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer)
    self.field3 = TextField(
        [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer)

    self.empty_text_field = self.field1.empty_field()
    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    empty_list_field = ListField([text_field.empty_field()])
    empty_fields = {'list_tensor': empty_list_field}
    self.empty_instance = Instance(empty_fields)

    non_empty_list_field = ListField([text_field])
    non_empty_fields = {'list_tensor': non_empty_list_field}
    self.non_empty_instance = Instance(non_empty_fields)

    super(TestListField, self).setUp()
def __init__(self,
             token_indexers=None,
             sentence_field_name='sentence',
             tags_field_name='tags',
             tag_namespace='tags'):
    if token_indexers is None:
        token_indexers = {
            'words': SingleIdTokenIndexer(namespace='tokens'),
            'chars': TokenCharactersIndexer(namespace='token_chars'),
        }
    self.token_indexers = token_indexers
    self.sentence_field_name = sentence_field_name
    self.tags_field_name = tags_field_name
    self.tag_namespace = tag_namespace