Example 1
    def create_instance(self, triple: Dict[str, Any], op_vec: np.ndarray,
                        pc_vec: np.ndarray) -> Instance:
        op_tokens = cv.get_tokens(triple["op_selftext"])
        pc_tokens = cv.get_tokens(triple["deltaed_comment"])
        exp_tokens = cv.get_tokens(triple["explanation"])
        input_tokens = op_tokens + [TRANSITION_TOKEN] + pc_tokens

        features = np.concatenate([op_vec, TRANSITION_VEC, pc_vec])
        assert len(features) == len(input_tokens)

        exp_stems = {tok.lemma_ for tok in exp_tokens}
        input_stems = [tok.lemma_ for tok in input_tokens]
        transferred = [int(stem in exp_stems) for stem in input_stems]
        assert len(transferred) == len(features)

        text_field = TextField(input_tokens,
                               token_indexers=self._token_indexers)

        return Instance({
            "tokens": text_field,
            "features": ArrayField(features),
            "is_transferred": SequenceLabelField(transferred, text_field),
        })
Example 2
    def setUp(self):
        token_indexer = SingleIdTokenIndexer("tokens")
        text_field = TextField(["a", "a", "a", "a", "b", "b", "c", "c", "c"],
                               {"tokens": token_indexer})
        self.instance = Instance({"text": text_field})
        self.dataset = Dataset([self.instance])
        super(TestVocabulary, self).setUp()
Example 3
    def test_token_to_indices_batch_size_2(self):
        # Checks whether or not AllenNLP overwrites padding logic.

        # Test with batch size 2 with different lengths.
        batch_sentences = [
            "I have a good dog called Killi .", "He fetches me stuff ."
        ]

        instances = []
        for sentence in batch_sentences:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {'test_chunky': self.indexer})
            instance = Instance({"elmo_chunky": field})
            instances.append(instance)

        vocab = Vocabulary()
        iterator = BasicIterator()
        iterator.index_with(vocab)

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        assert (batch['elmo_chunky']['mask'] > 0).sum(dim=1).tolist() == [8, 5]
        assert (batch['elmo_chunky']['seg_map'] > -1).sum(dim=1).tolist() == [8, 5]
        character_counts = (batch['elmo_chunky']['character_ids'] > 0).sum(dim=2)
        assert (character_counts == 50).sum(dim=1).tolist() == [8, 5]
Example 4
    def text_to_instance(self,
                         chains,
                         labels=None,
                         info=None,
                         entity_info=None) -> Instance:

        if self.debug_mode:
            print("-->> chains = ", chains)
            print("-->> labels = ", labels)
            print("-->> chain3 labels = ",
                  [chain[3]['label'] for chain in chains])

        fields: Dict[str, Field] = {}

        metadata = {
            "id": None,
            "instance_num": self._instance_num,
            "choice_type": None,
            "all_chains": chains,
            "labels": labels,
            "labeltext": None,
            "chain_id": chains[0][3]['chain_id'],
            "info": info,
            "original": chains[0][3].get('original', None)
        }

        self._instance_num += 1
        all_ids = [chain[3]['id'] for chain in chains]
        assert len(all_ids) == 1

        all_chains = [chain[:3] for chain in chains]
        metadata['id'] = all_ids[0]
        metadata['choice_type'] = chains[0][3]['choice_type']
        metadata['label'] = labels[0] if labels is not None else None
        metadata['labeltext'] = chains[0][3].get('labeltext', None)
        metadata['score'] = [chain[3].get('score', 0.0) for chain in chains]

        all_chains = [SEP.join(chain) for chain in all_chains]
        all_chains = [self._tokenizer.tokenize(chain) for chain in all_chains]
        all_chains = [TextField(chain, self._token_indexers) for chain in all_chains]
        all_chains = all_chains[0]
        fields['tokens'] = all_chains
        if labels is not None:
            fields['label'] = LabelField(str(labels[0]))
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields=fields)
Example 5
    def tag(self, text_field: TextField) -> Dict[str, Any]:
        """
        Perform inference on a TextField to produce predicted tags and class probabilities
        over the possible tags.

        Parameters
        ----------
        text_field : ``TextField``, required.
            A ``TextField`` containing the text to be tagged.

        Returns
        -------
        A Dict containing:

        tags : List[str]
            A list the length of the text input, containing the predicted (argmax) tag
            from the model per token.
        class_probabilities : numpy.ndarray
            An array of shape (text_input_length, num_classes), where each row is a
            distribution over classes for a given token in the sentence.
        """
        text_field.index(self.vocab)
        padding_lengths = text_field.get_padding_lengths()
        array_input = text_field.as_array(padding_lengths)
        # TODO(Mark): Generalise how the array is transformed into a variable after settling the data API.
        # Add a batch dimension by unsqueezing, because pytorch
        # doesn't support inputs without one.
        array_input = {
            "tokens": torch.autograd.Variable(
                torch.LongTensor(array_input["tokens"])).unsqueeze(0)
        }
        output_dict = self.forward(tokens=array_input)

        # Remove batch dimension, as we only had one input.
        predictions = output_dict["class_probabilities"].data.squeeze(0)
        _, argmax = predictions.max(-1)
        indices = argmax.squeeze(1).numpy()
        tags = [
            self.vocab.get_token_from_index(x, namespace="tags")
            for x in indices
        ]

        return {"tags": tags, "class_probabilities": predictions.numpy()}
Example 6
    def _predict_sentence(self, model, sentence, text, debug):
        tokens = []
        for item in sentence:
            if 'text' in item:
                token = item['text'].strip()
            else:
                token = text[item['start']:item['end']].strip()
            if token != '':
                tokens.append(Token(token))
        instance = Instance({
            'tokens': TextField(tokens, token_indexers={'tokens': self.indexer})
        })
        prediction = model.forward_on_instance(instance)
        sentiment = prediction['sentiment'][0]
        start = sentence[0]['start']
        end = sentence[-1]['end']
        item = {'start': start, 'end': end, 'sentiment': float(sentiment)}
        if debug:
            item['text'] = text[start:end]
        return item
Example 7
    def _predict_sentence(self, model, sentiment_map, sentence, text, debug):
        tokens = []
        for item in sentence:
            if 'text' in item:
                token = item['text'].strip()
            else:
                token = text[item['start']:item['end']].strip()
            if token != '':
                tokens.append(Token(token))
        instance = Instance({
            'tokens': TextField(tokens, token_indexers={'tokens': self.indexer})
        })
        prediction = model.forward_on_instance(instance)
        logits = prediction['logits']
        sentiment = sentiment_map[logits.argmax()]
        start = sentence[0]['start']
        end = sentence[-1]['end']
        item = {'start': start, 'end': end, 'sentiment': sentiment}
        if debug:
            item['text'] = text[start:end]
            item['logits'] = logits.astype('float').tolist()
        return item
Example 8
    def test_as_array_produces_token_array(self):
        indexer = SpacyTokenIndexer()
        nlp = get_spacy_model("en_core_web_sm", parse=False, ner=False)
        tokens = [t for t in nlp("This is a sentence.")]
        field = TextField(tokens, token_indexers={"spacy": indexer})

        vocab = Vocabulary()
        field.index(vocab)

        # Indexer functionality
        array_dict = indexer.tokens_to_indices(tokens, vocab)
        assert len(array_dict["tokens"]) == 5
        assert len(array_dict["tokens"][0]) == 96

        # Check it also works with field
        lengths = field.get_padding_lengths()
        array_dict = field.as_tensor(lengths)

        assert list(array_dict["spacy"]["tokens"].shape) == [5, 96]
Example 9
    def make_reading_comprehension_instance(
            self,
            question_tokens: List[Token],
            passage_tokens: List[Token],
            token_indexers: Dict[str, TokenIndexer],
            passage_text: str,
            token_spans: List[Tuple[int, int]] = None,
            answer_texts: List[str] = None,
            additional_metadata: Dict[str, Any] = None) -> Instance:
        """
        Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
        in a reading comprehension model.

        Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
        ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if ``token_spans``
        is given, the ``Instance`` has ``span_start`` and ``span_end`` fields, which are both
        ``IndexFields``.

        Parameters
        ----------
        question_tokens : ``List[Token]``
            An already-tokenized question.
        passage_tokens : ``List[Token]``
            An already-tokenized passage that contains the answer to the given question.
        token_indexers : ``Dict[str, TokenIndexer]``
            Determines how the question and passage ``TextFields`` will be converted into tensors that
            get input to a model.  See :class:`TokenIndexer`.
        passage_text : ``str``
            The original passage text.  We need this so that we can recover the actual span from the
            original passage that the model predicts as the answer to the question.  This is used in
            official evaluation scripts.
        token_spans : ``List[Tuple[int, int]]``, optional
            Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
            a list because there might be several possible correct answer spans in the passage.
            Currently, we just use the first span in this list, so the caller should sort or rank the
            spans beforehand (e.g., SQuAD has multiple annotations on the dev set).
        answer_texts : ``List[str]``, optional
            All valid answer strings for the given question.  In SQuAD, e.g., the training set has
            exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
            possible answers, which are the aliases for the known correct entity.  This is put into the
            metadata for use with official evaluation scripts, but not used anywhere else.
        additional_metadata : ``Dict[str, Any]``, optional
            The constructed ``metadata`` field will by default contain ``original_passage``,
            ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
            you want any other metadata to be associated with each instance, you can pass that in here.
            This dictionary will get added to the ``metadata`` dictionary we already construct.
        """
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}

        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        fields['passage'] = passage_field
        fields['question'] = TextField(question_tokens, token_indexers)
        metadata = {
            'original_passage': passage_text,
            'question_tokens': [token.text for token in question_tokens],
            'passage_tokens': [token.text for token in passage_tokens],
        }

        if answer_texts:
            metadata['answer_texts'] = answer_texts

        if token_spans:
            metadata["token_spans"] = token_spans

            # assume spans are sorted by some criteria
            span_start = token_spans[0][0]
            span_end = token_spans[0][1] - 1
            assert (span_start <= span_end)
            if span_end > len(passage_tokens) - 1:
                return None

            fields['span_start'] = IndexField(span_start, passage_field)
            fields['span_end'] = IndexField(span_end, passage_field)

        metadata.update(additional_metadata)
        fields['metadata'] = MetadataField(metadata)

        return Instance(fields)
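A minimal usage sketch for make_reading_comprehension_instance, assuming it is defined on a dataset-reader-like object (the name reader below is hypothetical); the tokenizer, indexer, and span values are illustrative. Note that the end index in token_spans is treated as exclusive, since the code above subtracts one from it.

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

# `reader` is assumed to be the object that defines the method above.
tokenizer = WordTokenizer()
passage_text = "Hamlet was written by William Shakespeare ."
question_tokens = tokenizer.tokenize("Who wrote Hamlet ?")
passage_tokens = tokenizer.tokenize(passage_text)

instance = reader.make_reading_comprehension_instance(
    question_tokens=question_tokens,
    passage_tokens=passage_tokens,
    token_indexers={"tokens": SingleIdTokenIndexer()},
    passage_text=passage_text,
    token_spans=[(4, 6)],                  # tokens 4-5: "William Shakespeare"
    answer_texts=["William Shakespeare"],
)
print(list(instance.fields.keys()))
# ['passage', 'question', 'span_start', 'span_end', 'metadata']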
Example 10
                }, output_json)
            output_json.write("\n")
        return chunk_tags


if __name__ == "__main__":
    from calypso.labeled_seglm_transformer import LabeledSegLMTransformer
    self = ChunkyElmoIndexer(
        "/Users/swabhas/pretrained/log_chunking_ptb_comparable/model.tar.gz",
        "/Users/swabhas/pretrained/log_1b_labeled_seglm_transformer/model.tar.gz"
    )
    sentence = "I have a good dog ."
    tokens = [Token(word) for word in sentence.split()]
    vocabulary = Vocabulary()
    index_name = "test-chunky"

    batch = ["I have a good dog .", "He fetches me stuff ."]

    from allennlp.data.dataset import Batch
    from allennlp.data.instance import Instance
    from allennlp.data.fields.text_field import TextField
    instances = []
    for sentence in batch:
        tokens = [Token(token) for token in sentence.split()]
        field = TextField(tokens, {'test_chunky': self})
        instance = Instance({"elmo_chunky": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)