def create_instance(self, triple: Dict[str, Any], op_vec: Array[float], pc_vec: Array[float]) -> Instance:
    """
    Build an ``Instance`` from one triple and its two feature arrays.

    The ``"op_selftext"`` and ``"deltaed_comment"`` texts are tokenized with
    ``cv.get_tokens`` and joined around ``TRANSITION_TOKEN``; ``op_vec`` and
    ``pc_vec`` are concatenated the same way around ``TRANSITION_VEC``.  Each
    input token is labelled 1 when its ``lemma_`` also occurs among the tokens
    of ``triple["explanation"]`` and 0 otherwise.

    Parameters
    ----------
    triple : ``Dict[str, Any]``
        Must provide the keys ``"op_selftext"``, ``"deltaed_comment"`` and
        ``"explanation"``.
    op_vec : ``Array[float]``
        Features placed before ``TRANSITION_VEC`` (presumably one entry per
        ``"op_selftext"`` token -- only the total length is checked here).
    pc_vec : ``Array[float]``
        Features placed after ``TRANSITION_VEC`` (presumably one entry per
        ``"deltaed_comment"`` token -- only the total length is checked here).

    Returns
    -------
    An ``Instance`` with the fields ``tokens``, ``features`` and
    ``is_transferred``.

    Raises
    ------
    AssertionError
        If the concatenated features and the token sequence differ in length.
    """
    post_tokens = cv.get_tokens(triple["op_selftext"])
    comment_tokens = cv.get_tokens(triple["deltaed_comment"])
    explanation_tokens = cv.get_tokens(triple["explanation"])

    sequence = post_tokens + [TRANSITION_TOKEN] + comment_tokens
    features = np.concatenate([op_vec, TRANSITION_VEC, pc_vec])
    assert len(features) == len(sequence)

    # A set gives constant-time membership tests for the per-token labels.
    explanation_lemmas = {token.lemma_ for token in explanation_tokens}
    labels = [int(token.lemma_ in explanation_lemmas) for token in sequence]
    assert len(labels) == len(features)

    tokens_field = TextField(sequence, token_indexers=self._token_indexers)
    fields = {
        "tokens": tokens_field,
        "features": ArrayField(features),
        "is_transferred": SequenceLabelField(labels, tokens_field),
    }
    return Instance(fields)
def setUp(self):
    # Fixture: a dataset holding a single instance whose only field, "text",
    # contains "a" four times, "b" twice and "c" three times.
    indexer = SingleIdTokenIndexer("tokens")
    words = ["a"] * 4 + ["b"] * 2 + ["c"] * 3
    self.instance = Instance({"text": TextField(words, {"tokens": indexer})})
    self.dataset = Dataset([self.instance])
    super(TestVocabulary, self).setUp()
def test_token_to_indices_batch_size_2(self):
    # Checks whether or not AllenNLP overwrites padding logic.
    # Test with batch size 2 with different lengths.
    sentences = [
        "I have a good dog called Killi .",
        "He fetches me stuff .",
    ]
    instances = [
        Instance({
            "elmo_chunky": TextField(
                [Token(word) for word in text.split()],
                {'test_chunky': self.indexer},
            )
        })
        for text in sentences
    ]

    vocab = Vocabulary()
    iterator = BasicIterator()
    iterator.index_with(vocab)

    # Only the first batch is needed; stop iterating as soon as we have it.
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break
    chunky = batch['elmo_chunky']

    # Whitespace token counts of the two sentences above.
    expected_lengths = [8, 5]

    mask_lengths = (chunky['mask'] > 0).sum(dim=1).tolist()
    assert mask_lengths == expected_lengths

    seg_lengths = (chunky['seg_map'] > -1).sum(dim=1).tolist()
    assert seg_lengths == expected_lengths

    # A position counts as a real token when all 50 of its character ids are positive.
    fully_populated = (chunky['character_ids'] > 0).sum(dim=2) == 50
    assert fully_populated.sum(dim=1).tolist() == expected_lengths
def text_to_instance(self, chains, labels=None, info=None, entity_info=None) -> Instance:
    """
    Convert a single chain (wrapped in a one-element list) into an ``Instance``.

    Parameters
    ----------
    chains : list
        Must contain exactly one chain.  Elements 0-2 of the chain are joined
        with ``SEP`` and tokenized; element 3 is read as a dict that provides
        ``'id'``, ``'chain_id'`` and ``'choice_type'`` and may provide
        ``'original'``, ``'labeltext'`` and ``'score'``.
    labels : list, optional
        Gold labels; only ``labels[0]`` is used.  When omitted, the instance
        has no ``label`` field and ``metadata['label']`` is ``None``.
    info : optional
        Stored unchanged under ``metadata['info']``.
    entity_info : optional
        Accepted for interface compatibility; not used here.

    Returns
    -------
    An ``Instance`` with a ``tokens`` field, a ``metadata`` field and, when
    ``labels`` is given, a ``label`` field.

    Raises
    ------
    AssertionError
        If ``chains`` does not contain exactly one chain.
    """
    if self.debug_mode:
        print("-->> chains = ", chains)
        print("-->> labels = ", labels)
        print("-->> chain3 labels = ", [chain[3]['label'] for chain in chains])

    chain_info = chains[0][3]
    metadata = {
        "id": None,
        "instance_num": self._instance_num,
        "choice_type": None,
        "all_chains": chains,
        "labels": labels,
        "labeltext": None,
        "chain_id": chain_info['chain_id'],
        "info": info,
        "original": chain_info.get('original', None),
    }
    self._instance_num += 1

    all_ids = [chain[3]['id'] for chain in chains]
    assert len(all_ids) == 1

    metadata['id'] = all_ids[0]
    metadata['choice_type'] = chain_info['choice_type']
    # ``labels`` is optional: indexing it unconditionally raised TypeError
    # for unlabeled (prediction-time) input.
    metadata['label'] = labels[0] if labels is not None else None
    metadata['labeltext'] = chain_info.get('labeltext', None)
    metadata['score'] = [chain[3].get('score', 0.0) for chain in chains]

    tokens = self._tokenizer.tokenize(SEP.join(chains[0][:3]))

    fields: Dict[str, Field] = {}
    fields['tokens'] = TextField(tokens, self._token_indexers)
    if labels is not None:
        # Only attach a label field when there is a label; ``None`` is not a Field.
        fields['label'] = LabelField(str(labels[0]))
    fields["metadata"] = MetadataField(metadata)
    return Instance(fields=fields)
def tag(self, text_field: TextField) -> Dict[str, Any]:
    """
    Perform inference on a TextField to produce predicted tags and class
    probabilities over the possible tags.

    Parameters
    ----------
    text_field : ``TextField``, required.
        A ``TextField`` containing the text to be tagged.  It is indexed
        in place with ``self.vocab``.

    Returns
    -------
    A Dict containing:

    tags : List[str]
        A list with one entry per input token, containing the predicted
        (argmax) tag from the model for that token.
    class_probabilities : numpy.Array
        An array of shape (text_input_length, num_classes), where each row is a
        distribution over classes for a given token in the sentence.
    """
    text_field.index(self.vocab)
    padding_lengths = text_field.get_padding_lengths()
    array_input = text_field.as_array(padding_lengths)

    # TODO(Mark): Generalise how the array is transformed into a variable after settling the data API.
    # Add a batch dimension by unsqueezing, because pytorch
    # doesn't support inputs without one.
    token_ids = torch.autograd.Variable(torch.LongTensor(array_input["tokens"])).unsqueeze(0)
    output_dict = self.forward(tokens={"tokens": token_ids})

    # Remove batch dimension, as we only had one input.
    predictions = output_dict["class_probabilities"].data.squeeze(0)
    _, argmax = predictions.max(-1)

    # Depending on the PyTorch version, ``max`` either keeps the reduced
    # dimension (shape ``(length, 1)``) or drops it (shape ``(length,)``).
    # ``squeeze(1)`` fails in the latter case; ``view(-1)`` handles both.
    indices = argmax.view(-1).numpy()
    tags = [
        self.vocab.get_token_from_index(index, namespace="tags")
        for index in indices
    ]
    return {"tags": tags, "class_probabilities": predictions.numpy()}
def _predict_sentence(self, model, sentence, text, debug):
    """
    Run ``model`` on one sentence and return its character span and sentiment.

    Parameters
    ----------
    model :
        Object exposing ``forward_on_instance``; its output must contain a
        ``'sentiment'`` entry whose first element is the score.
    sentence :
        Sequence of dicts.  Each dict supplies its token either directly under
        ``'text'`` or indirectly through ``'start'``/``'end'`` offsets into
        ``text``.  The first and last dicts must carry ``'start'`` and ``'end'``.
    text :
        The string that the offsets refer to.
    debug :
        When truthy, the covered slice of ``text`` is included in the result.

    Returns
    -------
    A dict with ``'start'``, ``'end'`` and ``'sentiment'`` (a float), plus
    ``'text'`` when ``debug`` is set.
    """
    raw_pieces = (
        entry['text'] if 'text' in entry else text[entry['start']:entry['end']]
        for entry in sentence
    )
    stripped = (piece.strip() for piece in raw_pieces)
    # Whitespace-only pieces are dropped rather than turned into empty tokens.
    tokens = [Token(piece) for piece in stripped if piece != '']

    text_field = TextField(tokens, token_indexers={'tokens': self.indexer})
    prediction = model.forward_on_instance(Instance({'tokens': text_field}))
    score = prediction['sentiment'][0]

    span_start = sentence[0]['start']
    span_end = sentence[-1]['end']
    result = {'start': span_start, 'end': span_end, 'sentiment': float(score)}
    if debug:
        result['text'] = text[span_start:span_end]
    return result
def _predict_sentence(self, model, sentiment_map, sentence, text, debug):
    """
    Run ``model`` on one sentence and return its character span and sentiment.

    Parameters
    ----------
    model :
        Object exposing ``forward_on_instance``; its output must contain a
        ``'logits'`` entry.
    sentiment_map :
        Indexed with ``logits.argmax()`` to obtain the reported sentiment.
    sentence :
        Sequence of dicts.  Each dict supplies its token either directly under
        ``'text'`` or indirectly through ``'start'``/``'end'`` offsets into
        ``text``.  The first and last dicts must carry ``'start'`` and ``'end'``.
    text :
        The string that the offsets refer to.
    debug :
        When truthy, the covered slice of ``text`` and the logits (as a list
        of floats) are included in the result.

    Returns
    -------
    A dict with ``'start'``, ``'end'`` and ``'sentiment'``, plus ``'text'``
    and ``'logits'`` when ``debug`` is set.
    """
    raw_pieces = (
        entry['text'] if 'text' in entry else text[entry['start']:entry['end']]
        for entry in sentence
    )
    stripped = (piece.strip() for piece in raw_pieces)
    # Whitespace-only pieces are dropped rather than turned into empty tokens.
    tokens = [Token(piece) for piece in stripped if piece != '']

    text_field = TextField(tokens, token_indexers={'tokens': self.indexer})
    prediction = model.forward_on_instance(Instance({'tokens': text_field}))
    logits = prediction['logits']
    label = sentiment_map[logits.argmax()]

    span_start = sentence[0]['start']
    span_end = sentence[-1]['end']
    result = {'start': span_start, 'end': span_end, 'sentiment': label}
    if debug:
        result['text'] = text[span_start:span_end]
        result['logits'] = logits.astype('float').tolist()
    return result
def test_as_array_produces_token_array(self):
    # The spaCy indexer should yield one 96-wide row per token, both when
    # called directly and when used through a TextField.
    spacy_indexer = SpacyTokenIndexer()
    pipeline = get_spacy_model("en_core_web_sm", parse=False, ner=False)
    tokens = list(pipeline("This is a sentence."))
    field = TextField(tokens, token_indexers={"spacy": spacy_indexer})

    vocab = Vocabulary()
    field.index(vocab)

    # Indexer functionality
    indexed = spacy_indexer.tokens_to_indices(tokens, vocab)["tokens"]
    assert len(indexed) == 5
    assert len(indexed[0]) == 96

    # Check it also works with field
    tensors = field.as_tensor(field.get_padding_lengths())
    assert list(tensors["spacy"]["tokens"].shape) == [5, 96]
def make_reading_comprehension_instance(
        self,
        question_tokens: List[Token],
        passage_tokens: List[Token],
        token_indexers: Dict[str, TokenIndexer],
        passage_text: str,
        token_spans: List[Tuple[int, int]] = None,
        answer_texts: List[str] = None,
        additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
    in a reading comprehension model.

    The ``Instance`` always has the fields ``passage`` and ``question`` (both ``TextFields``) and
    ``metadata`` (a ``MetadataField``).  When ``token_spans`` is non-empty it also has
    ``span_start`` and ``span_end``, both ``IndexFields`` into the passage.

    Parameters
    ----------
    question_tokens : ``List[Token]``
        An already-tokenized question.
    passage_tokens : ``List[Token]``
        An already-tokenized passage.
    token_indexers : ``Dict[str, TokenIndexer]``
        Passed to both ``TextFields``; determines how question and passage are converted into
        tensors.  See :class:`TokenIndexer`.
    passage_text : ``str``
        The original passage text, stored in the metadata as ``original_passage``.
    token_spans : ``List[Tuple[int, int]]``, optional
        Candidate answer spans as indices into ``passage_tokens``.  Only the first span is used
        for the ``span_start``/``span_end`` fields, so callers are expected to sort the list by
        whatever criterion they prefer.  One is subtracted from the second element of the span
        before it is used as ``span_end`` (i.e. it appears to be an exclusive end -- confirm
        against the caller).  The full list is stored in the metadata as ``token_spans``.
    answer_texts : ``List[str]``, optional
        Answer strings for the question.  Stored in the metadata as ``answer_texts`` when
        non-empty; not used otherwise.
    additional_metadata : ``Dict[str, Any]``, optional
        Extra entries merged into the metadata last, so they override the keys built here
        (``original_passage``, ``question_tokens``, ``passage_tokens`` and, when present,
        ``answer_texts`` and ``token_spans``).

    Returns
    -------
    The constructed ``Instance``, or ``None`` when the selected span ends beyond the last
    passage token.

    Raises
    ------
    AssertionError
        If the selected span's start lies after its (adjusted) end.
    """
    extra_metadata = additional_metadata or {}

    # Kept in its own variable because the IndexFields below point into it.
    passage_field = TextField(passage_tokens, token_indexers)
    fields: Dict[str, Field] = {
        'passage': passage_field,
        'question': TextField(question_tokens, token_indexers),
    }

    metadata = {
        'original_passage': passage_text,
        'question_tokens': [token.text for token in question_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }
    if answer_texts:
        metadata['answer_texts'] = answer_texts

    if token_spans:
        metadata["token_spans"] = token_spans
        # Spans are assumed to be pre-sorted; the first one is the training target.
        chosen_span = token_spans[0]
        span_start = chosen_span[0]
        span_end = chosen_span[1] - 1
        assert (span_start <= span_end)
        if span_end >= len(passage_tokens):
            # The answer lies outside the passage we were given: signal "no instance".
            return None
        fields['span_start'] = IndexField(span_start, passage_field)
        fields['span_end'] = IndexField(span_end, passage_field)

    metadata.update(extra_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
}, output_json) output_json.write("\n") return chunk_tags if __name__ == "__main__": from calypso.labeled_seglm_transformer import LabeledSegLMTransformer self = ChunkyElmoIndexer( "/Users/swabhas/pretrained/log_chunking_ptb_comparable/model.tar.gz", "/Users/swabhas/pretrained/log_1b_labeled_seglm_transformer/model.tar.gz" ) sentence = "I have a good dog ." tokens = [Token(word) for word in sentence.split()] vocabulary = Vocabulary() index_name = "test-chunky" batch = ["I have a good dog .", "He fetches me stuff ."] from allennlp.data.dataset import Batch, Instance from allennlp.data.fields.text_field import TextField instances = [] for sentence in batch: tokens = [Token(token) for token in sentence] field = TextField(tokens, {'test_chunky': self}) instance = Instance({"elmo_chunky": field}) instances.append(instance) dataset = Batch(instances) vocab = Vocabulary() dataset.index_instances(vocab)