Code example #1
    def forward_on_instance(
            self,
            instance: Instance,
            cuda_device: int,
            calculate_loss: bool = True) -> Dict[str, numpy.ndarray]:
        """
        Takes an :class:`~allennlp.data.instance.Instance`, which typically has raw text in it,
        converts that text into arrays using this model's :class:`Vocabulary`, passes those arrays
        through :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any ``torch.autograd.Variables``
        or ``torch.Tensors`` into numpy arrays and remove the batch dimension.
        """
        instance.index_fields(self.vocab)
        model_input = arrays_to_variables(instance.as_array_dict(),
                                          add_batch_dimension=True,
                                          cuda_device=cuda_device,
                                          for_training=False)
        forward_tensors = self.forward(**model_input,
                                       calculate_loss=calculate_loss)
        outputs = self.decode(forward_tensors)

        for name, output in list(outputs.items()):
            output = output[0]
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            elif isinstance(output, torch.Tensor):
                output = output.cpu().numpy()
            outputs[name] = output
        return outputs
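
A minimal usage sketch for the method above; the names `model` and `instance` are illustrative assumptions, standing in for a trained AllenNLP model of this era and an instance produced by its dataset reader:

# Hypothetical usage: `model` is a trained Model subclass exposing the
# forward_on_instance() defined above; cuda_device=-1 keeps everything on CPU.
outputs = model.forward_on_instance(instance, cuda_device=-1)
for name, value in outputs.items():
    print(name, getattr(value, "shape", value))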
Code example #2
 def visualize_instance(self, instance: Instance):
     """
     main function of this visualizer
     usage: take an instance and visualize it
     _model have to support "return_attention=True" kwarg
     _model have to have the correct vocab in it
     """
     logger = logging.getLogger(__name__)
     # indexing with model
     instance.index_fields(self._model.vocab)
     # get tokens from instance
     json_dict = instance2json(instance)
     tokens_p = json_dict["sentence1"]
     tokens_h = json_dict["sentence2"]
     gold_label = json_dict["gold_label"]
     # get predictions and attentions
     batch = Batch([instance])
     batch_tensor = batch.as_tensor_dict()
     ret = self._model.forward(**batch_tensor, return_attention=True)
     ret = self._model.make_output_human_readable(ret)
     pooler_p = ret["attentions"]["pooler1"][0]
     pooler_h = ret["attentions"]["pooler2"][0]
     logger.setLevel(logging.DEBUG)
     logger.info(f"tokens_p are {tokens_p}")
     logger.info(f"tokens_h are {tokens_h}")
     logger.info(f"the predicted label is {ret['predicted_label']}")
     logger.info(f"the gold label is {gold_label}")
     # expected signature: show_sequence_attention(strlist, att, msg=None)
     show_sequence_attention(tokens_p, pooler_p)
     show_sequence_attention(tokens_h, pooler_h)
     return
Code example #3
File: bidaf.py Project: sbhaktha/allennlp
    def predict_span(self, question: TextField, passage: TextField) -> Dict[str, Any]:
        """
        Given a question and a passage, predicts the span in the passage that answers the question.

        Parameters
        ----------
        question : ``TextField``
            A ``TextField`` containing the tokens in the question.
        passage : ``TextField``
            A ``TextField`` containing the tokens in the passage.  Note that we typically add
            ``SquadReader.STOP_TOKEN`` as the final token in the passage, because we use exclusive
            span indices.  Be sure you've added that to the passage you pass in here.

        Returns
        -------
        A Dict containing:

        span_start_probs : numpy.ndarray
        span_end_probs : numpy.ndarray
        best_span : (int, int)
        """
        instance = Instance({'question': question, 'passage': passage})
        instance.index_fields(self.vocab)
        model_input = util.arrays_to_variables(instance.as_array_dict(),
                                               add_batch_dimension=True,
                                               for_training=False)
        output_dict = self.forward(**model_input)

        # Here we're just removing the batch dimension and converting things to numpy arrays /
        # tuples instead of pytorch variables.
        return {
                "span_start_probs": output_dict["span_start_probs"].data.squeeze(0).cpu().numpy(),
                "span_end_probs": output_dict["span_end_probs"].data.squeeze(0).cpu().numpy(),
                "best_span": tuple(output_dict["best_span"].data.squeeze(0).cpu().numpy()),
                }
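
A hedged sketch of calling predict_span; `model` (a trained instance of this class) and `indexers` (token indexers matching its training config) are assumptions, and the docstring's note about appending SquadReader.STOP_TOKEN to the passage is followed:

# Illustrative only: `model` and `indexers` are assumed to exist and match
# the model's training configuration.
question = TextField([Token(t) for t in "Who wrote the book ?".split()], indexers)
passage_tokens = [Token(t) for t in "Jane wrote the book .".split()]
passage_tokens.append(Token(SquadReader.STOP_TOKEN))  # exclusive span indices
passage = TextField(passage_tokens, indexers)
result = model.predict_span(question, passage)
print(result["best_span"], result["span_start_probs"].shape)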
Code example #4
File: model.py Project: xumx/allennlp
    def forward_on_instance(self,
                            instance: Instance) -> Dict[str, numpy.ndarray]:
        """
        Takes an :class:`~allennlp.data.instance.Instance`, which typically has raw text in it,
        converts that text into arrays using this model's :class:`Vocabulary`, passes those arrays
        through :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any ``torch.autograd.Variables``
        or ``torch.Tensors`` into numpy arrays and remove the batch dimension.
        """
        # Hack to see what cuda device the model is on, so we know where to put these inputs.  For
        # complicated models, or machines with multiple GPUs, this will not work.  I couldn't find
        # a way to actually query what device a tensor / parameter is on.
        cuda_device = 0 if next(self.parameters()).is_cuda else -1
        instance.index_fields(self.vocab)
        model_input = arrays_to_variables(instance.as_array_dict(),
                                          add_batch_dimension=True,
                                          cuda_device=cuda_device,
                                          for_training=False)
        outputs = self.decode(self.forward(**model_input))

        for name, output in list(outputs.items()):
            output = output[0]
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
        return outputs
Code example #5
    def predict_entailment(self, premise: TextField,
                           hypothesis: TextField) -> Dict[str, torch.Tensor]:
        """
        Given a premise and a hypothesis sentence, predict the entailment relationship between
        them.

        Parameters
        ----------
        premise : ``TextField``
        hypothesis : ``TextField``

        Returns
        -------
        A Dict containing:

        label_probs : torch.FloatTensor
            A tensor of shape ``(num_labels,)`` representing probabilities of the entailment label.
        """
        instance = Instance({"premise": premise, "hypothesis": hypothesis})
        instance.index_fields(self._vocab)
        model_input = arrays_to_variables(instance.as_array_dict(),
                                          add_batch_dimension=True,
                                          for_training=False)
        output_dict = self.forward(**model_input)

        # Remove batch dimension, as we only had one input.
        label_probs = output_dict["label_probs"].data.squeeze(0)
        return {'label_probs': label_probs.numpy()}
Code example #6
    def predict_entailment(self, premise: TextField,
                           hypothesis: TextField) -> Dict[str, torch.Tensor]:
        """
        Given a premise and a hypothesis sentence, predict the entailment relationship between
        them.  Note that in the paper, a null token was appended to each sentence, to allow for
        words to align to nothing in the other sentence.  If you've trained your model with a null
        token, you probably want to include it here, too.

        Parameters
        ----------
        premise : ``TextField``
        hypothesis : ``TextField``

        Returns
        -------
        A Dict containing:

        label_probs : torch.FloatTensor
            A tensor of shape ``(num_labels,)`` representing probabilities of the entailment label.
        """
        instance = Instance({"premise": premise, "hypothesis": hypothesis})
        instance.index_fields(self.vocab)
        model_input = arrays_to_variables(instance.as_array_dict(),
                                          add_batch_dimension=True,
                                          for_training=False)
        output_dict = self.forward(**model_input)

        # Remove batch dimension, as we only had one input.
        label_probs = output_dict["label_probs"].data.squeeze(0)
        return {'label_probs': label_probs.numpy()}
Code example #7
File: elmo.py Project: mhrmm/allennlp
def token_to_elmo_id(token):
    tokens = [Token(token)]
    field = TextField(tokens, {'character_ids': indexer})
    instance = Instance({"elmo": field})
    instances = [instance]
    dataset = Dataset(instances)
    vocab = Vocabulary()
    for instance in dataset.instances:
        instance.index_fields(vocab)
    #dataset.index_instances(vocab) # replaced by above, so that there's no progress bar
    return dataset.as_tensor_dict()['elmo']['character_ids']
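
The function relies on a module-level `indexer` global; a sketch of how it might be set up and called (the 50-wide last dimension comes from ELMo's per-token character-id encoding):

# Assumed module-level setup, matching the `indexer` global used above.
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
indexer = ELMoTokenCharactersIndexer()

character_ids = token_to_elmo_id("dog")
print(character_ids.shape)  # (1, 1, 50): batch, tokens, character ids per token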
Code example #8
File: task.py Project: shuningjin/discrete-text-rep
 def _make_instance(input1, label):
     """ from multiple types in one column create multiple fields """
     d = {}
     d["sent_str1"] = MetadataField(" ".join(input1))
     input1 = ["<SOS>"] + input1 + ["<EOS>"]
     d["input1"] = sentence_to_text_field(input1, indexers)
     d["labels"] = LabelField(
         label, label_namespace="labels", skip_indexing=True
     )
     d = Instance(d)
     d.index_fields(vocab)
     return d
Code example #9
 def test_empty_list_can_be_tensorized(self):
     tokenizer = SpacyTokenizer()
     tokens = tokenizer.tokenize("Foo")
     text_field = TextField(tokens, self.word_indexer)
     list_field = ListField([text_field.empty_field()])
     fields = {
         "list": list_field,
         "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
     }
     instance = Instance(fields)
     instance.index_fields(self.vocab)
     instance.as_tensor_dict()
Code example #10
    def tag(self, text_field: TextField,
            verb_indicator: IndexField) -> Dict[str, Any]:
        """
        Perform inference on an ``Instance`` consisting of a single ``TextField`` representing
        the sentence and an ``IndexField`` representing an optional index into the sentence
        denoting a verbal predicate.

        The returned sequence is the maximum-likelihood tag sequence under the
        constraint that it must be a valid BIO sequence.

        Parameters
        ----------
        text_field : ``TextField``, required.
            A ``TextField`` containing the text to be tagged.
        verb_indicator: ``IndexField``, required.
            The index of the verb whose arguments we are labeling.

        Returns
        -------
        A Dict containing:

        tags : List[str]
            A list the length of the text input, containing the predicted (argmax) tag
            from the model per token.
        class_probabilities : numpy.ndarray
            An array of shape (text_input_length, num_classes), where each row is a
            distribution over classes for a given token in the sentence.
        """
        instance = Instance({
            "tokens": text_field,
            "verb_indicator": verb_indicator
        })
        instance.index_fields(self.vocab)
        model_input = arrays_to_variables(instance.as_array_dict(),
                                          add_batch_dimension=True,
                                          for_training=False)
        output_dict = self.forward(**model_input)

        # Remove batch dimension, as we only had one input.
        predictions = output_dict["class_probabilities"].data.squeeze(0)
        transition_matrix = self.get_viterbi_pairwise_potentials()

        max_likelihood_sequence, _ = viterbi_decode(predictions,
                                                    transition_matrix)
        tags = [
            self.vocab.get_token_from_index(x, namespace="tags")
            for x in max_likelihood_sequence
        ]

        return {"tags": tags, "class_probabilities": predictions.numpy()}
Code example #11
def batch_to_ids(batch):
    """
    Given a batch (as list of tokenized sentences), return a batch
    of padded character ids.
    """
    instances = []
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Dataset(instances)
    vocab = Vocabulary()
    # dataset.index_instances(vocab)
    for instance in dataset.instances:
        instance.index_fields(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
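
A usage sketch, assuming the same module-level `indexer` global as in code example #7; shorter sentences are padded to the longest one in the batch:

sentences = [["First", "sentence", "."], ["Another", "one"]]
character_ids = batch_to_ids(sentences)
print(character_ids.shape)  # (2, 3, 50): batch, max tokens, character ids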
Code example #12
def get_embeddings(text):
    words = text.split()
    sentence = TextField(
        [Token(x) for x in words],
        token_indexers={
            "tokens": SingleIdTokenIndexer(namespace="token_ids"),
            "characters": TokenCharactersIndexer(namespace="token_characters")
        })

    instance = Instance({"sentence": sentence})
    instances = [instance]
    for instance in instances:
        instance.index_fields(vocab)

    batch = Batch(instances)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())

    text_field_variables = tensors["sentence"]

    # This will have shape: (batch_size, sentence_length, word_embedding_dim + character_cnn_output_dim)
    embedded_text = text_field_embedder(text_field_variables)
    return embedded_text
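
The function above depends on `vocab` and `text_field_embedder` globals; a hypothetical sketch of how they might be wired up, mirroring the full-script example below (all dimensions are illustrative):

# Hypothetical setup for the globals used above; sizes are illustrative.
from allennlp.modules.token_embedders import Embedding, TokenCharactersEncoder
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

word_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_ids"),
                           embedding_dim=10)
char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_characters"),
                           embedding_dim=5)
character_cnn = CnnEncoder(embedding_dim=5, num_filters=2, output_dim=8)
token_characters_encoder = TokenCharactersEncoder(embedding=char_embedding,
                                                  encoder=character_cnn)
text_field_embedder = BasicTextFieldEmbedder({"tokens": word_embedding,
                                              "characters": token_characters_encoder})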
Code example #13
    def tag(self, text_field: TextField) -> Dict[str, Any]:
        """
        Perform inference on a TextField to produce predicted tags and class probabilities
        over the possible tags.

        Parameters
        ----------
        text_field : ``TextField``, required.
            A ``TextField`` containing the text to be tagged.

        Returns
        -------
        A Dict containing:

        tags : List[str]
            A list the length of the text input, containing the predicted (argmax) tag
            from the model per token.
        class_probabilities : numpy.ndarray
            An array of shape (text_input_length, num_classes), where each row is a
            distribution over classes for a given token in the sentence.
        """
        instance = Instance({'tokens': text_field})
        instance.index_fields(self.vocab)
        model_input = arrays_to_variables(instance.as_array_dict(),
                                          add_batch_dimension=True,
                                          for_training=False)
        output_dict = self.forward(**model_input)

        # Remove batch dimension, as we only had one input.
        predictions = output_dict["class_probabilities"].data.squeeze(0)
        _, argmax = predictions.max(-1)
        indices = argmax.numpy()
        tags = [
            self.vocab.get_token_from_index(x, namespace="labels")
            for x in indices
        ]

        return {"tags": tags, "class_probabilities": predictions.numpy()}
Code example #14
 def create_instance(self, str_tokens: List[str]):
     tokens = [Token(t) for t in str_tokens]
     instance = Instance({'text': TextField(tokens, self.token_indexers)})
     instance.index_fields(self.vocab)
     return instance
Code example #15
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.token_embedders import Embedding
from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer

import torch

# In a real pipeline, a helper would gather all of the text in the dataset into
# the 'text' variable; for now, take this single example from the dataset.
text = "The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7. In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,? the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.The two notifications ? one mandating the celebration of Rakshabandhan (left) and the other withdrawing the mandate (right) ? were issued by the Daman and Diu administration a day apart. The circular was withdrawn through a one-line order issued late in the evening by the UT?s department of personnel and administrative reforms.?The circular is ridiculous. There are sensitivities involved. How can the government dictate who I should tie rakhi to? We should maintain the professionalism of a workplace? an official told Hindustan Times earlier in the day. She refused to be identified.The notice was issued on Daman and Diu administrator and former Gujarat home minister Praful Kodabhai Patel?s direction, sources said.Rakshabandhan, a celebration of the bond between brothers and sisters, is one of several Hindu festivities and rituals that are no longer confined of private, family affairs but have become tools to push politic al ideologies.In 2014, the year BJP stormed to power at the Centre, Rashtriya Swayamsevak Sangh (RSS) chief Mohan Bhagwat said the festival had ?national significance? and should be celebrated widely ?to protect Hindu culture and live by the values enshrined in it?. The RSS is the ideological parent of the ruling BJP.Last year, women ministers in the Modi government went to the border areas to celebrate the festival with soldiers. A year before, all cabinet ministers were asked to go to their constituencies for the festival."

words = text.split()
sentence = TextField([Token(x) for x in words], token_indexers={"tokens": SingleIdTokenIndexer(namespace="token_ids"), "characters": TokenCharactersIndexer(namespace="token_characters")})

instance = Instance({"sentence": sentence})
instances = [instance]

vocab = Vocabulary.from_instances(instances)

for instance in instances:
    instance.index_fields(vocab)

word_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_ids"), embedding_dim=10)
char_embedding = Embedding(num_embeddings=vocab.get_vocab_size("token_characters"), embedding_dim=5)
character_cnn = CnnEncoder(embedding_dim=5, num_filters=2, output_dim=8)

# saving everything
vocab.save_to_files("vocab")
torch.save(character_cnn, "character_cnn.pt")
torch.save(word_embedding, "word_embedding.pt")
torch.save(char_embedding, "char_embedding.pt")
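
A short sketch of loading the saved artifacts back, using the same paths as above; loading whole modules with torch.load is assumed acceptable here:

# Reload the artifacts saved above.
vocab = Vocabulary.from_files("vocab")
character_cnn = torch.load("character_cnn.pt")
word_embedding = torch.load("word_embedding.pt")
char_embedding = torch.load("char_embedding.pt")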
Code example #16
 def create_instance(self, tokens: List[str]):
     # TextField expects a list of Token objects, not raw strings.
     instance = Instance({'text': TextField([Token(t) for t in tokens], self.token_indexers)})
     instance.index_fields(self.vocab)
     return instance