def test_instance_implements_mutable_mapping(self):
    words_field = TextField([Token("hello")], {})
    label_field = LabelField(1, skip_indexing=True)
    instance = Instance({"words": words_field, "labels": label_field})

    assert instance["words"] == words_field
    assert instance["labels"] == label_field
    assert len(instance) == 2

    keys = {k for k, v in instance.items()}
    assert keys == {"words", "labels"}

    values = [v for k, v in instance.items()]
    assert words_field in values
    assert label_field in values

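# A hedged companion sketch to the test above: since Instance behaves like a MutableMapping,
# fields can also be added after construction with add_field. The field names and values
# here are illustrative, not part of the original test.
instance = Instance({"words": TextField([Token("hello")], {})})
instance.add_field("labels", LabelField(1, skip_indexing=True))
assert {k for k, _ in instance.items()} == {"words", "labels"}
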
def text_to_instance(self, comment_text: str, sentiment: int = None) -> Instance:
    tokens = self._tokenizer.tokenize(comment_text)
    if self._max_seq_len is not None:
        tokens = tokens[:self._max_seq_len]
    sentence_field = TextField(tokens, self._token_indexers)
    fields = {"tokens": sentence_field}
    # Only attach a label when a sentiment was actually provided (e.g. not at prediction time).
    if sentiment is not None:
        fields["label"] = LabelField(str(sentiment))
    return Instance(fields)

def text_to_instance(self, tokens: List[Token], id: str = None, labels: np.ndarray = None) -> Instance:
    sentence_field = TextField(tokens, self.token_indexers)
    fields = {"tokens": sentence_field}

    id_field = MetadataField(id)
    fields["id"] = id_field

    # `label_cols` is assumed to be defined in the enclosing module
    # (the list of label column names).
    if labels is None:
        labels = np.zeros(len(label_cols))
    label_field = ArrayField(array=labels)
    fields["label"] = label_field

    return Instance(fields)

def text_to_instance(self, text: str, hypothesis: str, label: str = None) -> Instance:
    fields = {}
    # To make it look like two instances: [CLS] sent1 [SEP] and [CLS] sent2 [SEP]
    tokens1 = self._tokenizer.tokenize(text)
    tokens2 = self._tokenizer.tokenize(hypothesis)
    fields["tokens1"] = TextField(tokens1, self._token_indexers)
    fields["tokens2"] = TextField(tokens2, self._token_indexers)
    if label is not None:
        fields["labels"] = LabelField(label)
    return Instance(fields)

def _sentences_to_ids(self, sentences):
    indexer = ELMoTokenCharactersIndexer()
    # For each sentence, first create a TextField, then create an instance.
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({'elmo': field})
        instances.append(instance)
    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']

def text_to_instance(self, source_tokens: List[Token], target_tokens: List[Token]) -> Optional[Instance]:
    source_tokens.insert(0, Token(START_SYMBOL))
    source_tokens.append(Token(END_SYMBOL))
    target_tokens.insert(0, Token(START_SYMBOL))
    target_tokens.append(Token(END_SYMBOL))
    fields = {
        'source_tokens': TextField(source_tokens, self.token_indexers),
        'target_tokens': TextField(target_tokens, self.target_token_indexers),
    }
    return Instance(fields)

def test_registrability(self):
    @Vocabulary.register('my-vocabulary')
    class MyVocabulary:
        @classmethod
        def from_params(cls, params, instances=None):  # pylint: disable=unused-argument
            return MyVocabulary()

    params = Params({'type': 'my-vocabulary'})
    instance = Instance(fields={})
    vocab = Vocabulary.from_params(params=params, instances=[instance])
    assert isinstance(vocab, MyVocabulary)

def predict_batch(self, texts):
    instances = []
    for text in texts:
        tokens = self._tokenizer.tokenize(text)
        instance = Instance({'tokens': TextField(tokens, self._token_indexers)})
        instances.append(instance)
    result = self.model.forward_on_instances(instances)
    results = []
    for instance_result in result:
        results.append(self._format_instance_result(instance_result))
    return results

def toInstance(self, names: List[str], categories: List[str] = None) -> Instance:
    token_field = TextField([Token(nm) for nm in names], self.token_indexers)
    fields = {"tokens": token_field}
    fields["token_characters"] = TextField([Token(nm) for nm in names], self.char_indexers)
    if categories:
        fields["labels"] = SequenceLabelField(labels=categories, sequence_field=token_field)
    return Instance(fields)

def test_registrability(self): @Vocabulary.register("my-vocabulary") class MyVocabulary: @classmethod def from_params(cls, params, instances=None): return MyVocabulary() params = Params({"type": "my-vocabulary"}) instance = Instance(fields={}) vocab = Vocabulary.from_params(params=params, instances=[instance]) assert isinstance(vocab, MyVocabulary)
def get_dataset(self):
    field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                       self.token_indexer)
    field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                       self.token_indexer)
    field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                       self.token_indexer)
    field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                       self.token_indexer)
    instances = [
        Instance({"text1": field1, "text2": field2}),
        Instance({"text1": field3, "text2": field4}),
    ]
    return Dataset(instances)

def text_to_instance(self, src_seq: Iterable[Token], tgt_seq: str,
                     salience_seq: Iterable[float]) -> Instance:
    indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    tokenized_src = src_seq[:self._source_max_tokens]
    tokenized_tgt = self._tokenizer.tokenize(tgt_seq)[:self._target_max_tokens]
    source_field = TextField(tokenized_src, {'tokens': indexer})
    target_field = TextField(tokenized_tgt, {'tokens': indexer})
    if self._interpolation:
        saliency_field = ArrayField(
            np.array(self.smooth_and_norm(salience_seq)[:self._source_max_tokens]))
    else:
        saliency_field = ArrayField(np.array(salience_seq[:self._source_max_tokens]))
    return Instance({
        'source_tokens': source_field,
        'target_tokens': target_field,
        'salience_values': saliency_field,
    })

def text_to_instance(self, string: str, lang2_lang: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    """Used at prediction time."""
    lang_pair = self._undefined_lang_id + '-' + lang2_lang
    tokenized_string = self._tokenizer.tokenize(string)
    string_field = TextField(tokenized_string, self._token_indexers)
    return Instance({
        self._mingler.dataset_name_field: MetadataField(lang_pair),
        'lang1_tokens': string_field,
    })

def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {"character_ids": indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()
    dataset = AllennlpDataset(instances, vocab)

    # Now finally we can iterate through batches.
    loader = DataLoader(dataset, 3)
    for i, batch in enumerate(loader):
        lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings["activations"][2], lm_embeddings["mask"])

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            self.assertTrue(
                numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6,
                ))

def text_to_instance(
    self,  # type: ignore
    qid: str,
    start: str,
    alternatives: List[str],
    label: Optional[int] = None,
) -> Instance:
    # tokenize
    start = self._tokenizer.tokenize(start)
    sequences = []
    for alternative in alternatives:
        alternative = self._tokenizer.tokenize(alternative)
        length_for_start = (
            self.length_limit - len(alternative) - self._tokenizer.num_special_tokens_for_pair()
        )
        if length_for_start < 0:
            # If the alternative is too long by itself, we take the beginning
            # and add no tokens from the start.
            alternative = alternative[:length_for_start]
            length_for_start = 0
        sequences.append(self._tokenizer.add_special_tokens(start[:length_for_start], alternative))

    # make fields
    from allennlp.data.fields import IndexField, ListField, MetadataField, TextField

    sequences = [TextField(sequence, self._token_indexers) for sequence in sequences]
    sequences = ListField(sequences)
    fields = {
        "alternatives": sequences,
        "qid": MetadataField(qid),
    }

    if label is not None:
        if label < 0 or label >= len(sequences):
            raise ValueError(f"Alternative {label} does not exist")
        fields["correct_alternative"] = IndexField(label, sequences)

    return Instance(fields)

def text_to_instance(self,
                     sup_sents: List[str],
                     sup_labels: List[str] = None,
                     ori_unsup_sents=None,
                     aug_unsup_sents=None) -> Instance:
    fields: Dict[str, Field] = {}
    tokenized_sents = [self._tokenizer.tokenize(sent) for sent in sup_sents]
    sentence_sequence = ListField([TextField(tk, self._token_indexers) for tk in tokenized_sents])
    fields['sentences'] = sentence_sequence
    if sup_labels is not None:
        fields['labels'] = SequenceLabelField(sup_labels, sentence_sequence)

    ori_name = 'ori_unsup_sentences_'
    aug_name = 'aug_unsup_sentences_'
    if ori_unsup_sents is not None and aug_unsup_sents is not None:
        # Create a ListField of TextFields for each group of original unsupervised sentences.
        ori_unsup_sentence_sequences = []
        for ori_unsup_sent in ori_unsup_sents:
            ori_unsup_tokenized_sent = [self._tokenizer.tokenize(sent) for sent in ori_unsup_sent]
            ori_unsup_sentence_sequence = ListField(
                [TextField(tk, self._token_indexers) for tk in ori_unsup_tokenized_sent])
            ori_unsup_sentence_sequences.append(ori_unsup_sentence_sequence)

        # Create a ListField of TextFields for each group of augmented unsupervised sentences.
        aug_unsup_sentence_sequences = []
        for aug_unsup_sent in aug_unsup_sents:
            aug_unsup_tokenized_sent = [self._tokenizer.tokenize(sent) for sent in aug_unsup_sent]
            aug_unsup_sentence_sequence = ListField(
                [TextField(tk, self._token_indexers) for tk in aug_unsup_tokenized_sent])
            aug_unsup_sentence_sequences.append(aug_unsup_sentence_sequence)

        # Each group becomes its own numbered field: the AllenNLP iterator requires
        # ListFields (rather than plain Python lists) for the BERT vocabulary to be applied.
        for i in range(len(ori_unsup_sentence_sequences)):
            fields[ori_name + str(i)] = ori_unsup_sentence_sequences[i]
            fields[aug_name + str(i)] = aug_unsup_sentence_sequences[i]

    return Instance(fields)

def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in izip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {u'character_ids': indexer})
            instance = Instance({u"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()

    # Now finally we can iterate through batches.
    iterator = BasicIterator(3)
    iterator.index_with(vocab)
    for i, batch in enumerate(iterator(instances, num_epochs=1, shuffle=False)):
        lm_embeddings = elmo_bilm(batch[u'elmo'][u'character_ids'])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings[u'activations'][2], lm_embeddings[u'mask'])

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            self.assertTrue(
                numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6))

def text_to_instance(self, mention_uniq_id, data=None) -> Instance:
    l_tokenized = [Token('[CLS]')]
    l_tokenized += [Token(split_token) for split_token in
                    self.custom_tokenizer_class.tokenize(txt=data['l'])][:self.config.max_mention_length]
    l_tokenized.append(Token('[SEP]'))

    r_tokenized = [Token('[CLS]')]
    r_tokenized += [Token(split_token) for split_token in
                    self.custom_tokenizer_class.tokenize(txt=data['r'])][:self.config.max_mention_length]
    r_tokenized.append(Token('[SEP]'))

    l_plus_r = [Token('[CLS]')]
    l_plus_r += [Token(split_token) for split_token in
                 self.custom_tokenizer_class.tokenize(txt=data['l'])][:self.config.max_mention_length]
    l_plus_r += [Token(BOND_TOKEN)]
    l_plus_r += [Token(split_token) for split_token in
                 self.custom_tokenizer_class.tokenize(txt=data['r'])][:self.config.max_mention_length]
    l_plus_r += [Token('[SEP]')]

    context_field = TextField(l_tokenized, self.token_indexers)
    fields = {"l": context_field}
    fields['lev'] = ArrayField(np.array(Levenshtein.distance(data['l'], data['r'])))
    fields['r'] = TextField(r_tokenized, self.token_indexers)
    fields['l_plus_r'] = TextField(l_plus_r, self.token_indexers)
    fields['label'] = ArrayField(np.array(data['label']))
    fields['mention_uniq_id'] = ArrayField(np.array(mention_uniq_id))
    fields['subword_match_num'] = ArrayField(np.array(len(
        set(self.custom_tokenizer_class.tokenize(txt=data['l']))
        & set(self.custom_tokenizer_class.tokenize(txt=data['r'])))))
    return Instance(fields)

def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_tokens_to_namespace(["a", "b"], namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)

    text_field1 = TextField(
        [Token(t) for t in ["a", "c"]], {"tokens1": SingleIdTokenIndexer("tokens1")}
    )
    text_field2 = TextField(
        [Token(t) for t in ["p", "q", "r"]], {"tokens2": SingleIdTokenIndexer("tokens2")}
    )
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # The next two statements should raise an error: tokens1 is non-padded in
    # original_vocab but not in the instances.
    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": [],
            "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
        }
    )
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)

    # The next two statements should not raise an error: the overlapping namespaces
    # have the same padding setting.
    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1"],
            "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
        }
    )
    Vocabulary.from_params(params, instances=instances)

    # The next two statements should raise an error: tokens2 is padded in the instances
    # but not in original_vocab.
    params = Params(
        {
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens2"],
            "tokens_to_add": {"tokens1": ["a"], "tokens2": ["p"]},
        }
    )
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)

def text_to_instance(self, prompt: List[List[str]], evidence: List[str],
                     non_evidence: List[str]):
    fields = {
        'comb_evidence': TextField(
            [Token(x) for x in
             (['[CLS]'] + prompt[0] + prompt[1] + prompt[2] + ['[SEP]'] + evidence)],
            self.token_indexers),
        'comb_non_evidence': TextField(
            [Token(x) for x in
             (['[CLS]'] + prompt[0] + prompt[1] + prompt[2] + ['[SEP]'] + non_evidence)],
            self.token_indexers),
    }
    return Instance(fields)

def text_to_instance(x: Tuple[str, str]) -> Instance:
    source_string = x[0]
    target_string = x[1]

    tokenized_source = source_tokenizer.tokenize(source_string)[:source_length_limit]
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, source_token_indexers)

    tokenized_target = target_tokenizer.tokenize(target_string)[:target_length_limit]
    tokenized_target.insert(0, Token(START_SYMBOL))
    tokenized_target.append(Token(END_SYMBOL))
    target_field = TextField(tokenized_target, target_token_indexers)

    return Instance({source_field_name: source_field, target_field_name: target_field})

def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer, 'tokens': indexer2})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()["elmo"]

def batch_to_ids(batch):
    """
    Given a batch (as a list of tokenized sentences), return a batch of padded character ids.
    """
    instances = []
    # Index each token by its ELMo character ids.
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']

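# A minimal usage sketch for batch_to_ids above; the sentences are made up for illustration.
# Each token is padded to 50 character ids, so the returned tensor has shape
# (batch_size, max_sentence_length, 50), with shorter sentences zero-padded.
sentences = [["I", "like", "cats", "."], ["Dogs", "too"]]
character_ids = batch_to_ids(sentences)
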
def text_to_instance(self, tokens: List[Token], tags: List[str] = None,
                     title: str = None) -> Instance:
    tokens_field = TextField(tokens, self.token_indexers)
    fields = {"tokens": tokens_field}
    if tags:
        tags_field = SequenceLabelField(labels=tags, sequence_field=tokens_field)
        fields["tags"] = tags_field
    if title:
        fields["title"] = MetadataField(title)
    return Instance(fields)

def _make_instance(sent_):
    """
    The forward targets add <s> as the target for the input </s>, and the backward
    targets add </s> as the target for the input <s>, to avoid having to strip
    extra tokens from the input for each direction.
    """
    d = {
        "input": sentence_to_text_field(sent_, indexers),
        "targs": sentence_to_text_field(sent_[1:] + [sent_[0]], self.target_indexer),
        "targs_b": sentence_to_text_field([sent_[-1]] + sent_[:-1], self.target_indexer),
    }
    return Instance(d)

def predictions_to_labeled_instances(self, instance: Instance,
                                     outputs: Dict[str, numpy.ndarray]):
    new_instance = instance.duplicate()
    token_field: TextField = instance["tokens"]  # type: ignore
    mask_targets = [
        Token(target_top_k_text[0], text_id=target_top_id_id)
        for (target_top_k_text, target_top_id_id)
        in zip(outputs["words"], outputs["token_ids"])
    ]
    new_instance.add_field(
        "target_ids",
        TextField(mask_targets, token_field._token_indexers),
        vocab=self._model.vocab,
    )
    return [new_instance]

def text_to_instance(self, tokens: List[Token], instance_id: int = -1,
                     labels: int = 0) -> Instance:
    sentence_field = TextField(tokens, self.token_indexers)
    # The label is already an integer id, so skip vocabulary indexing.
    label_field = LabelField(labels, skip_indexing=True)
    id_field = MetadataField(instance_id)
    fields = {
        "tokens": sentence_field,
        "label": label_field,
        "instance_id": id_field,
    }
    return Instance(fields)

def text_to_instance(self, tokens: List[Token], label: int = None, id: str = None) -> Instance:
    fields = {}
    sentence_field = TextField(tokens=tokens, token_indexers=self.token_indexers)
    fields['sentence'] = sentence_field
    id_field = MetadataField(id)
    fields['id'] = id_field
    # Only attach a label when one was provided (e.g. not at prediction time).
    if label is not None:
        fields['labels'] = LabelField(label=label, skip_indexing=True)
    return Instance(fields=fields)

def text_to_instance(self, tokens, label):
    """Build text and label fields and convert them to an ``Instance``.

    Arguments:
        tokens {List[str]} -- tokens
        label {str} -- a label

    Returns:
        {Instance} -- a data instance
    """
    sentence_field = TextField(tokens, self.token_indexers)
    label_field = LabelField(label=label)
    fields = {"text": sentence_field, "labels": label_field}
    return Instance(fields)

def text_to_instance(self, tokenized_sentence: List[str], spans: List[List[int]]) -> Instance:
    allennlp_sentence_tokens = [Token(text=t) for t in tokenized_sentence]
    sentence_token_indexes = TextField(allennlp_sentence_tokens, self._token_indexers)

    span_fields = []
    for span_start, span_end_exclusive in spans:
        span_field = SpanField(span_start, span_end_exclusive - 1, sentence_token_indexes)
        span_fields.append(span_field)

    fields: Dict[str, Field] = {}
    fields["tokens"] = sentence_token_indexes
    fields["spans"] = ListField(span_fields)
    return Instance(fields)

def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
    if len(tokens) > self._max_token_len:
        tokens = tokens[:self._max_token_len]
        print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
        if tags:
            tags = tags[:self._max_token_len]

    fields = {}
    text_field = TextField(tokens, self._token_indexers)
    fields['tokens'] = text_field
    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)
    return Instance(fields)

def create_instance(self, str_tokens: List[str]):
    tokens = [Token(t) for t in str_tokens]
    instance = Instance({'text': TextField(tokens, self.token_indexers)})
    instance.index_fields(self.vocab)
    return instance

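# A minimal usage sketch for create_instance above; `reader` (an object exposing
# token_indexers and a vocab) and the example tokens are illustrative assumptions.
# Because the instance is already indexed, it can be converted to tensors directly.
instance = reader.create_instance(["this", "is", "a", "sentence"])
tensors = instance.as_tensor_dict()
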