def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info("Creating dataset from all text in file: %s", file_path)
        for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        yield Instance({'input_tokens': input_field,
                        'output_tokens': output_field})
def setUp(self):
    super().setUp()
    token_indexer = {"tokens": SingleIdTokenIndexer()}
    field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                       token_indexer)
    field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                       token_indexer)
    field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                       token_indexer)
    field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                       token_indexer)
    self.instances = [Instance({"text1": field1, "text2": field2}),
                      Instance({"text1": field3, "text2": field4})]
def text_to_instance(self, sentence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_string = self._tokenizer.tokenize(sentence)
    input_field = TextField(tokenized_string[:-1], self._token_indexers)
    output_field = TextField(tokenized_string[1:], self._output_indexer)
    return Instance({'input_tokens': input_field,
                     'output_tokens': output_field})
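# A hedged usage sketch, assuming this method belongs to AllenNLP's
# LanguageModelingReader (which the _read method above matches).
from allennlp.data.dataset_readers import LanguageModelingReader

reader = LanguageModelingReader()
instance = reader.text_to_instance("the cat sat on the mat")
# input_tokens and output_tokens are the same sentence offset by one token,
# the standard next-word prediction setup:
#   input_tokens:  the cat sat on the
#   output_tokens: cat sat on the mat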
def get_instances(self):
    field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                       self.token_indexer)
    field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                       self.token_indexer)
    field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                       self.token_indexer)
    field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                       self.token_indexer)
    instances = [Instance({"text1": field1, "text2": field2}),
                 Instance({"text1": field3, "text2": field4})]
    return instances
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)
    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # The following three should raise: tokens1 is non-padded in original_vocab
    # but would be padded in the extension.
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=[],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following three should not raise: overlapping namespaces have the
    # same padding setting.
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)
    extended_vocab = copy.copy(original_vocab)
    extended_vocab._extend(non_padded_namespaces=["tokens1"],
                           tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following three should raise: tokens2 is padded in original_vocab
    # but declared non-padded in the extension.
    params = Params({"directory_path": vocab_dir, "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypothesis: str,
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    tokenized_premise = self._tokenizer.tokenize(premise)
    tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)
    fields["premise"] = TextField(tokenized_premise, self._token_indexers)
    fields["hypothesis"] = TextField(tokenized_hypothesis, self._token_indexers)
    if label is not None:
        fields['label'] = LabelField(label)
    return Instance(fields)
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     pos_tags: List[str] = None,
                     chunk_tags: List[str] = None,
                     ner_tags: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Recode the labels if necessary.
    if self.coding_scheme == "BIOUL":
        coded_chunks = to_bioul(chunk_tags) if chunk_tags is not None else None
        coded_ner = to_bioul(ner_tags) if ner_tags is not None else None
    else:
        # the default IOB1
        coded_chunks = chunk_tags
        coded_ner = ner_tags

    # Add "feature labels" to the instance.
    if 'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError("Dataset reader was specified to use pos_tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
    if 'chunk' in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError("Dataset reader was specified to use chunk tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
    if 'ner' in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError("Dataset reader was specified to use NER tags as "
                                     "features. Pass them to text_to_instance.")
        instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")

    # Add the "tag label" to the instance.
    if self.tag_label == 'ner' and coded_ner is not None:
        instance_fields['tags'] = SequenceLabelField(coded_ner, sequence, self.label_namespace)
    elif self.tag_label == 'pos' and pos_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
    elif self.tag_label == 'chunk' and coded_chunks is not None:
        instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence, self.label_namespace)

    return Instance(instance_fields)
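# A hedged usage sketch, assuming this method belongs to AllenNLP's
# Conll2003DatasetReader (consistent with the fields and options above).
from allennlp.data.dataset_readers import Conll2003DatasetReader
from allennlp.data.tokenizers import Token

reader = Conll2003DatasetReader(tag_label="ner", coding_scheme="BIOUL")
tokens = [Token(t) for t in ["U.N.", "official", "Ekeus", "heads", "for", "Baghdad"]]
instance = reader.text_to_instance(tokens,
                                   ner_tags=["I-ORG", "O", "I-PER", "O", "O", "I-LOC"])
# With BIOUL recoding, the single-token IOB1 spans above become U-ORG / U-PER / U-LOC.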
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
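# A minimal usage sketch for batch_to_ids. ELMo's character indexer encodes
# each word as 50 character ids, so the result has shape
# (batch_size, max_sentence_length, 50); shorter sentences are zero-padded.
character_ids = batch_to_ids([["I", "ate", "an", "apple"], ["Hello"]])
assert list(character_ids.shape) == [2, 4, 50]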
def setUp(self):
    token_indexer = SingleIdTokenIndexer("tokens")
    text_field = TextField([Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
                           {"tokens": token_indexer})
    self.instance = Instance({"text": text_field})
    self.dataset = Batch([self.instance])
    super().setUp()
def text_to_instance(self,  # type: ignore
                     tokens: List[Token],
                     verb_label: List[int],
                     tags: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields['tokens'] = text_field
    fields['verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)

    if all(x == 0 for x in verb_label):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                        "verb": verb})
    return Instance(fields)
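# A hedged usage sketch, assuming this method belongs to AllenNLP's SrlReader
# (which it matches). The verb indicator is a one-hot vector over the tokens.
from allennlp.data.dataset_readers import SrlReader
from allennlp.data.tokenizers import Token

reader = SrlReader()
tokens = [Token(t) for t in ["The", "dog", "barked", "."]]
instance = reader.text_to_instance(tokens,
                                   verb_label=[0, 0, 1, 0],
                                   tags=["B-ARG0", "I-ARG0", "B-V", "O"])
# The metadata field records verb = "barked", the token at the one-hot position.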
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     ccg_categories: List[str] = None,
                     original_pos_tags: List[str] = None,
                     modified_pos_tags: List[str] = None,
                     predicate_arg_categories: List[str] = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    ccg_categories : ``List[str]``, optional, (default = None).
        The CCG categories for the words in the sentence. (e.g. N/N)
    original_pos_tags : ``List[str]``, optional, (default = None).
        The tag assigned to the word in the Penn Treebank.
    modified_pos_tags : ``List[str]``, optional, (default = None).
        The POS tag might have changed during the translation to CCG.
    predicate_arg_categories : ``List[str]``, optional, (default = None).
        Encodes the word-word dependencies in the underlying predicate-argument structure.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        ccg_categories : ``SequenceLabelField``
            The CCG categories (only if supplied)
        original_pos_tags : ``SequenceLabelField``
            Original POS tag (only if supplied)
        modified_pos_tags : ``SequenceLabelField``
            Modified POS tag (only if supplied)
        predicate_arg_categories : ``SequenceLabelField``
            Predicate-argument categories (only if supplied)
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(x) for x in tokens],
                           token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    for field_name, labels in (('ccg_categories', ccg_categories),
                               ('original_pos_tags', original_pos_tags),
                               ('modified_pos_tags', modified_pos_tags),
                               ('predicate_arg_categories', predicate_arg_categories)):
        if labels is not None:
            # End the namespace with "labels" so the Vocabulary doesn't add
            # padding and UNK tokens to it.
            namespace = self._label_namespace_prefix + field_name + '_labels'
            fields[field_name] = SequenceLabelField(labels, text_field, namespace)

    return Instance(fields)
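# A hedged usage sketch, assuming this method belongs to AllenNLP's
# CcgBankDatasetReader (which this signature matches).
from allennlp.data.dataset_readers import CcgBankDatasetReader

reader = CcgBankDatasetReader()
instance = reader.text_to_instance(["Ms.", "Haag", "plays", "Elianti"],
                                   ccg_categories=["N/N", "N", "(S[dcl]\\NP)/NP", "N"])
# Only the "tokens" and "ccg_categories" fields are present here, because the
# other label sequences were not supplied.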
def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
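# A minimal sketch, assuming AllenNLP's SequenceTaggingDatasetReader, whose
# text_to_instance this matches.
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader
from allennlp.data.tokenizers import Token

reader = SequenceTaggingDatasetReader()
tokens = [Token(t) for t in ["cats", "are", "animals"]]
instance = reader.text_to_instance(tokens, tags=["N", "V", "N"])
# instance.fields: "tokens" (TextField), "tags" (SequenceLabelField), "metadata"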
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypothesis: str,
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields['label'] = LabelField(label)
    metadata = {"premise_tokens": [x.text for x in premise_tokens],
                "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
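# A hedged usage sketch, assuming this method belongs to AllenNLP's SnliReader
# (which it matches).
from allennlp.data.dataset_readers import SnliReader

reader = SnliReader()
instance = reader.text_to_instance("A man is eating.",
                                   "Someone is having a meal.",
                                   label="entailment")
# Fields: "premise", "hypothesis", "label", and "metadata" with the raw tokens.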
def text_to_instance(self, source_string: str, target_string: str = None) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)
    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        return Instance({"source_tokens": source_field,
                         "target_tokens": target_field})
    else:
        return Instance({'source_tokens': source_field})
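# A minimal sketch, assuming AllenNLP's Seq2SeqDatasetReader (which this
# method matches). Both sequences get START_SYMBOL / END_SYMBOL markers.
from allennlp.data.dataset_readers import Seq2SeqDatasetReader

reader = Seq2SeqDatasetReader()
instance = reader.text_to_instance("all these sentences should get copied",
                                   "all these sentences should get copied")
# instance.fields contains "source_tokens" and "target_tokens"; omitting the
# target string yields an instance with "source_tokens" only (for prediction).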
def test_as_tensor_handles_words(self):
    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"words": SingleIdTokenIndexer("words")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                            numpy.array([1, 1, 1, 2, 1]))
def setUp(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("this", "words")
    self.vocab.add_token_to_namespace("is", "words")
    self.vocab.add_token_to_namespace("a", "words")
    self.vocab.add_token_to_namespace("sentence", "words")
    self.vocab.add_token_to_namespace("s", "characters")
    self.vocab.add_token_to_namespace("e", "characters")
    self.vocab.add_token_to_namespace("n", "characters")
    self.vocab.add_token_to_namespace("t", "characters")
    self.vocab.add_token_to_namespace("c", "characters")
    for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
        self.vocab.add_token_to_namespace(label, "labels")

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")}
    self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]],
                            self.word_indexer)
    self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]],
                            self.word_indexer)
    self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]],
                            self.word_indexer)

    self.empty_text_field = self.field1.empty_field()
    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    super().setUp()
def test_max_vocab_size_partial_dict(self):
    indexers = {"tokens": SingleIdTokenIndexer(),
                "token_characters": TokenCharactersIndexer()}
    instance = Instance({
            'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')],
                              indexers)
    })
    dataset = Batch([instance])
    params = Params({"max_vocab_size": {"tokens": 1}})

    vocab = Vocabulary.from_params(params=params, instances=dataset)
    # 1 kept token + 2 (padding and OOV)
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3
    # 26 characters + 2 (padding and OOV); no size limit was set for this namespace
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28
def _sentences_to_ids(self, sentences):
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance.
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({'elmo': field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance.
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()

    # Now finally we can iterate through batches.
    iterator = BasicIterator(3)
    iterator.index_with(vocab)
    for i, batch in enumerate(iterator(instances, num_epochs=1, shuffle=False)):
        lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
        top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings['activations'][2],
                lm_embeddings['mask'])

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            self.assertTrue(numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6))
def text_to_instance(self, tokens: List[str], sentiment: str = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    sentiment : ``str``, optional, (default = None).
        The sentiment for this sentence.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence or phrase.
        label : ``LabelField``
            The sentiment label of the sentence or phrase.
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}
    if sentiment is not None:
        # 0 and 1 are negative sentiment, 2 is neutral, and 3 and 4 are positive sentiment.
        # In 5-class, we use the labels as is.
        # 3-class reduces the granularity, and only asks the model to predict
        # negative, neutral, or positive.
        # 2-class further reduces the granularity by only asking the model to
        # predict whether an instance is negative or positive.
        if self._granularity == "3-class":
            if int(sentiment) < 2:
                sentiment = "0"
            elif int(sentiment) == 2:
                sentiment = "1"
            else:
                sentiment = "2"
        elif self._granularity == "2-class":
            if int(sentiment) < 2:
                sentiment = "0"
            elif int(sentiment) == 2:
                # Neutral instances are dropped entirely in the 2-class setting.
                return None
            else:
                sentiment = "1"
        fields['label'] = LabelField(sentiment)
    return Instance(fields)
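# A hedged usage sketch, assuming this method belongs to AllenNLP's
# StanfordSentimentTreeBankDatasetReader (which it matches).
from allennlp.data.dataset_readers import StanfordSentimentTreeBankDatasetReader

reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class")
instance = reader.text_to_instance(["a", "great", "movie"], sentiment="4")
# The raw label "4" (very positive) collapses to "1" (positive) under 2-class,
# while neutral instances are dropped:
assert reader.text_to_instance(["it", "is", "a", "movie"], sentiment="2") is None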
def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer,
                                   'tokens': indexer2})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()["elmo"]
def test_as_tensor_handles_characters(self):
    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"characters": TokenCharactersIndexer("characters")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                            [1, 3, 0, 0, 0, 0, 0, 0],
                                            [1, 0, 0, 0, 0, 0, 0, 0],
                                            [3, 4, 5, 6, 4, 5, 7, 4],
                                            [1, 0, 0, 0, 0, 0, 0, 0]])
    numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                            expected_character_array)
def text_to_instance(self,  # type: ignore
                     words: List[str],
                     upos_tags: List[str],
                     dependencies: List[Tuple[str, int]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1-indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head
    indices as fields.
    """
    fields: Dict[str, Field] = {}
    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["words"] = tokens
    fields["pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace="pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                 tokens,
                                                 label_namespace="head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                    tokens,
                                                    label_namespace="head_index_tags")

    fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
    return Instance(fields)
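# A hedged usage sketch, assuming this method belongs to AllenNLP's
# UniversalDependenciesDatasetReader (which it matches). Head index 0 marks
# the root word.
from allennlp.data.dataset_readers import UniversalDependenciesDatasetReader

reader = UniversalDependenciesDatasetReader()
instance = reader.text_to_instance(words=["dogs", "bark"],
                                   upos_tags=["NOUN", "VERB"],
                                   dependencies=[("nsubj", 2), ("root", 0)])
# "dogs" attaches to word 2 ("bark") as nsubj, and "bark" is the root (index 0).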
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding='utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

    vocab_dir = self.TEST_DIR / 'vocab_save'
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
    assert indexed_tokens == indexed_tokens2
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField([Token(t) for t in ["a", "b"]],
                           {"tokens": SingleIdTokenIndexer("tokens")})
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend the vocab from `directory_path`, instances must be passed
    # to the Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"directory_path": vocab_dir, "extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend the vocab, the `directory_path` key must be present in
    # params, or else there is nothing to extend from.
    params = Params({"extend": True})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
    field = TextField([Token(t) for t in ["a", "sentence", "."]],
                      token_indexers={"words": SingleIdTokenIndexer("words"),
                                      "characters": TokenCharactersIndexer("characters")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    padding_lengths["num_tokens"] = 5
    padding_lengths["num_token_characters"] = 10
    tensor_dict = field.as_tensor(padding_lengths)

    numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                            numpy.array([1, 2, 1, 0, 0]))
    numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
def test_token_embedder_returns_dict(self):
    field = TextField([Token(t) for t in ["A", "sentence"]],
                      token_indexers={"field_with_dict": DictReturningTokenIndexer(),
                                      "words": SingleIdTokenIndexer("words"),
                                      "characters": TokenCharactersIndexer("characters")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
            'token_ids': 5,
            'additional_key': 2,
            'words': 2,
            'characters': 2,
            'num_token_characters': 8
    }
    padding_lengths['token_ids'] = 7
    padding_lengths['additional_key'] = 3
    padding_lengths['words'] = 4
    padding_lengths['characters'] = 4
    tensors = field.as_tensor(padding_lengths)
    assert list(tensors['token_ids'].shape) == [7]
    assert list(tensors['additional_key'].shape) == [3]
    assert list(tensors['words'].shape) == [4]
    assert list(tensors['characters'].shape) == [4, 8]
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     pos_tags: List[str] = None,
                     gold_tree: Tree = None) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens in a given sentence.
    pos_tags : ``List[str]``, optional, (default = None).
        The POS tags for the words in the sentence.
    gold_tree : ``Tree``, optional (default = None).
        The gold parse tree to create span labels from.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence.
        pos_tags : ``SequenceLabelField``
            The POS tags of the words in the sentence.
            Only returned if ``use_pos_tags`` is ``True``.
        spans : ``ListField[SpanField]``
            A ListField containing all possible subspans of the sentence.
        span_labels : ``SequenceLabelField``, optional.
            The constituency tags for each of the possible spans, with respect
            to a gold parse tree. If a span is not contained within the tree,
            a span will have a ``NO-LABEL`` label.
        gold_tree : ``MetadataField(Tree)``
            The gold NLTK parse tree for use in evaluation.
    """
    # pylint: disable=arguments-differ
    text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    pos_namespace = self._label_namespace_prefix + self._pos_label_namespace
    if self._use_pos_tags and pos_tags is not None:
        pos_tag_field = SequenceLabelField(pos_tags, text_field, label_namespace=pos_namespace)
        fields["pos_tags"] = pos_tag_field
    elif self._use_pos_tags:
        raise ConfigurationError("use_pos_tags was set to True but no gold pos"
                                 " tags were passed to the dataset reader.")

    spans: List[Field] = []
    gold_labels = []
    if gold_tree is not None:
        gold_spans: Dict[Tuple[int, int], str] = {}
        self._get_gold_spans(gold_tree, 0, gold_spans)
    else:
        gold_spans = None

    for start, end in enumerate_spans(tokens):
        spans.append(SpanField(start, end, text_field))
        if gold_spans is not None:
            # Spans absent from the gold tree get the NO-LABEL class.
            gold_labels.append(gold_spans.get((start, end), "NO-LABEL"))

    metadata = {"tokens": tokens}
    if gold_tree:
        metadata["gold_tree"] = gold_tree
    if self._use_pos_tags:
        metadata["pos_tags"] = pos_tags
    fields["metadata"] = MetadataField(metadata)

    span_list_field: ListField = ListField(spans)
    fields["spans"] = span_list_field
    if gold_tree is not None:
        fields["span_labels"] = SequenceLabelField(
                gold_labels,
                span_list_field,
                label_namespace=self._label_namespace_prefix + "labels")
    return Instance(fields)
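# A hedged usage sketch, assuming this method belongs to AllenNLP's
# PennTreeBankConstituencySpanDatasetReader (which it matches).
from nltk import Tree
from allennlp.data.dataset_readers import PennTreeBankConstituencySpanDatasetReader

reader = PennTreeBankConstituencySpanDatasetReader()
tree = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V barks)))")
instance = reader.text_to_instance(["the", "dog", "barks"],
                                   pos_tags=["D", "N", "V"],
                                   gold_tree=tree)
# "spans" enumerates all 6 subspans of the 3-token sentence; spans present in
# the tree (e.g. (0, 1) -> "NP") get their label, the rest get "NO-LABEL".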
def test_field_counts_vocab_items_correctly(self):
    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"words": SingleIdTokenIndexer("words")})
    namespace_token_counts = defaultdict(lambda: defaultdict(int))
    field.count_vocab_items(namespace_token_counts)

    assert namespace_token_counts["words"]["This"] == 1
    assert namespace_token_counts["words"]["is"] == 1
    assert namespace_token_counts["words"]["a"] == 1
    assert namespace_token_counts["words"]["sentence"] == 1
    assert namespace_token_counts["words"]["."] == 1
    assert list(namespace_token_counts.keys()) == ["words"]

    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"characters": TokenCharactersIndexer("characters")})
    namespace_token_counts = defaultdict(lambda: defaultdict(int))
    field.count_vocab_items(namespace_token_counts)

    assert namespace_token_counts["characters"]["T"] == 1
    assert namespace_token_counts["characters"]["h"] == 1
    assert namespace_token_counts["characters"]["i"] == 2
    assert namespace_token_counts["characters"]["s"] == 3
    assert namespace_token_counts["characters"]["a"] == 1
    assert namespace_token_counts["characters"]["e"] == 3
    assert namespace_token_counts["characters"]["n"] == 2
    assert namespace_token_counts["characters"]["t"] == 1
    assert namespace_token_counts["characters"]["c"] == 1
    assert namespace_token_counts["characters"]["."] == 1
    assert list(namespace_token_counts.keys()) == ["characters"]

    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"words": SingleIdTokenIndexer("words"),
                                      "characters": TokenCharactersIndexer("characters")})
    namespace_token_counts = defaultdict(lambda: defaultdict(int))
    field.count_vocab_items(namespace_token_counts)
    assert namespace_token_counts["characters"]["T"] == 1
    assert namespace_token_counts["characters"]["h"] == 1
    assert namespace_token_counts["characters"]["i"] == 2
    assert namespace_token_counts["characters"]["s"] == 3
    assert namespace_token_counts["characters"]["a"] == 1
    assert namespace_token_counts["characters"]["e"] == 3
    assert namespace_token_counts["characters"]["n"] == 2
    assert namespace_token_counts["characters"]["t"] == 1
    assert namespace_token_counts["characters"]["c"] == 1
    assert namespace_token_counts["characters"]["."] == 1
    assert namespace_token_counts["words"]["This"] == 1
    assert namespace_token_counts["words"]["is"] == 1
    assert namespace_token_counts["words"]["a"] == 1
    assert namespace_token_counts["words"]["sentence"] == 1
    assert namespace_token_counts["words"]["."] == 1
    assert set(namespace_token_counts.keys()) == {"words", "characters"}
def test_printing_doesnt_crash(self):
    field = TextField([Token(t) for t in ["A", "sentence"]],
                      {"words": SingleIdTokenIndexer(namespace="words")})
    print(field)