예제 #1
0
    def tag(self, text_field: TextField) -> Dict[str, Any]:
        """
        Perform inference on a TextField to produce predicted tags and class probabilities
        over the possible tags.

        Parameters
        ----------
        text_field : ``TextField``, required.
            A ``TextField`` containing the text to be tagged. It is indexed in place
            against ``self.vocab`` before being converted to an array.

        Returns
        -------
        A Dict containing:

        tags : List[str]
            A list the length of the text input, containing the predicted (argmax) tag
            from the model per token, looked up in the ``"tags"`` vocabulary namespace.
        class_probabilities : numpy.ndarray
            An array of shape (text_input_length, num_classes), where each row is a
            distribution over classes for a given token in the sentence.
        """
        text_field.index(self.vocab)
        padding_lengths = text_field.get_padding_lengths()
        array_input = text_field.as_array(padding_lengths)
        # TODO(Mark): Generalise how the array is transformed into a variable after settling the data API.
        # Add a batch dimension by unsqueezing, because pytorch
        # doesn't support inputs without one.
        token_ids = torch.LongTensor(array_input["tokens"])
        batched_input = {"tokens": torch.autograd.Variable(token_ids).unsqueeze(0)}
        output_dict = self.forward(tokens=batched_input)

        # Remove batch dimension, as we only had one input.
        predictions = output_dict["class_probabilities"].data.squeeze(0)
        _, argmax = predictions.max(-1)
        # Depending on the PyTorch version, ``max`` over a dimension either keeps the
        # reduced dimension (giving shape (length, 1)) or drops it (giving shape
        # (length,)). Flattening handles both, whereas ``squeeze(1)`` raises when the
        # dimension has already been dropped.
        indices = argmax.view(-1).numpy()
        tags = [
            self.vocab.get_token_from_index(index, namespace="tags")
            for index in indices
        ]

        return {"tags": tags, "class_probabilities": predictions.numpy()}
예제 #2
0
    def test_as_array_produces_token_array(self):
        """
        Check that ``SpacyTokenIndexer`` yields one fixed-width entry per token, both
        when called directly and when used through a ``TextField``.
        """
        # One entry per token of "This is a sentence."; the per-token width of 96 is
        # presumably the vector size of the small English spaCy model -- confirm if
        # the model is ever swapped.
        expected_token_count = 5
        expected_vector_width = 96

        indexer = SpacyTokenIndexer()
        spacy_model = get_spacy_model("en_core_web_sm", parse=False, ner=False)
        tokens = list(spacy_model("This is a sentence."))
        field = TextField(tokens, token_indexers={"spacy": indexer})

        vocab = Vocabulary()
        field.index(vocab)

        # Calling the indexer directly.
        indexed = indexer.tokens_to_indices(tokens, vocab)
        assert len(indexed["tokens"]) == expected_token_count
        assert len(indexed["tokens"][0]) == expected_vector_width

        # Going through the field: results are keyed by the indexer name first.
        padding_lengths = field.get_padding_lengths()
        tensors = field.as_tensor(padding_lengths)
        tensor_shape = list(tensors["spacy"]["tokens"].shape)

        assert tensor_shape == [expected_token_count, expected_vector_width]