Example #1
    def add_label(
        self,
        instance: Instance,
        label: Union[List[str], List[int], str, int],
        to_field: str = "label",
    ) -> Optional[Instance]:
        """Includes the label field for classification into the instance data"""
        # "if not label:" fails for ndarrays this is why we explicitly check None
        if label is None:
            return instance

        field = None
        # check if multilabel and if adequate type
        if self._multilabel and isinstance(label, (list, numpy.ndarray)):
            label = label.tolist() if isinstance(label, numpy.ndarray) else label
            field = MultiLabelField(label, label_namespace=vocabulary.LABELS_NAMESPACE)
        # check if not multilabel and adequate type + check for empty strings
        if not self._multilabel and isinstance(label, (str, int)) and label:
            field = LabelField(label, label_namespace=vocabulary.LABELS_NAMESPACE)
        if not field:
            # We have label info but we cannot build the label field --> discard the instance
            return None

        instance.add_field(to_field, field)
        return instance
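
A minimal usage sketch of `add_label` (not from the source; the `pipeline` object standing in for whatever class owns the method, and the label values, are assumptions):

    # Hedged sketch: single-label vs. multi-label use of `add_label`.
    instance = pipeline.add_label(instance, "positive")  # goes into the default "label" field
    # With the multilabel flag set, a list (or numpy array) of labels is accepted:
    instance = pipeline.add_label(instance, ["sports", "politics"], to_field="labels")
    # `label=None` returns the instance unchanged; an unusable label (e.g. an
    # empty string) makes the method return None, and the instance is discarded.
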
Example #2
    def _add_label(
        self,
        instance: Instance,
        label: Union[List[str], List[int], str, int],
        to_field: str = "label",
    ) -> Instance:
        """Adds the label field for classification into the instance data

        Helper function for the child's `self.featurize` method.

        Parameters
        ----------
        instance
            Add a label field to this instance
        label
            The label data
        to_field
            Name of the field to which the label is added

        Returns
        -------
        instance
            If `label` is not None, return `instance` with a label field added.
            Otherwise return just the given `instance`.

        Raises
        ------
        FeaturizeError
            If the label is an empty string or does not match the type:
            - (str, int) for single label
            - (list, np.array) for multi label
        """
        # "if not label:" fails for ndarrays, this is why we explicitly check for None
        if label is None:
            return instance

        field = None
        # check if multilabel and if adequate type
        if self._multilabel and isinstance(label, (list, numpy.ndarray)):
            label = label.tolist() if isinstance(label,
                                                 numpy.ndarray) else label
            field = MultiLabelField(
                label, label_namespace=vocabulary.LABELS_NAMESPACE)
        # check if not multilabel and adequate type + check for empty strings
        if not self._multilabel and isinstance(label, (str, int)) and label:
            field = LabelField(label,
                               label_namespace=vocabulary.LABELS_NAMESPACE)
        if not field:
            # We have label info but we cannot build the label field --> discard the instance
            raise FeaturizeError(
                f"Cannot create label field for `label={label}`!")

        instance.add_field(to_field, field)

        return instance
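
Compared with Example #1, this variant raises `FeaturizeError` instead of silently discarding the instance, so a caller typically wraps it. A minimal sketch of that pattern (the `build_textfield_instance` helper and `self._logger` are hypothetical; only `_add_label` and `FeaturizeError` come from the example above):

    def featurize(self, text: str, label=None) -> Optional[Instance]:
        instance = self.build_textfield_instance(text)  # hypothetical helper
        try:
            return self._add_label(instance, label)
        except FeaturizeError as error:
            # skip records whose label cannot be turned into a field
            self._logger.warning(error)  # hypothetical logger attribute
            return None
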
Example #3
    def text_to_instance(self, line: str) -> Optional[Instance]:
        tokens = []
        tags = []
        toks_tags = self.tokenizer.tokenize(line)
        if not toks_tags:
            return None
        for tok_tag in toks_tags:
            tok, *tag = tok_tag.text.split(self._word_tag_delimiter)
            tokens.append(Token(tok))
            # `tag` is a (possibly empty) list after the starred unpacking;
            # fall back to the UNK tag when no tag was provided
            tags.append(tag[0] if tag else UNK)

        inst = Instance({'tokens': TextField(tokens, {})})
        inst.add_field('tags', SequenceLabelField(tags, inst['tokens']))
        return inst
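
A usage sketch for the reader above (the reader's construction, the "/" word/tag delimiter, and a whitespace-style tokenizer are assumptions, not taken from the source):

    reader = ...  # however the dataset reader above is instantiated
    instance = reader.text_to_instance("The/DT dog/NN barked/VBD")
    print([token.text for token in instance["tokens"]])  # ['The', 'dog', 'barked']
    print(instance["tags"].labels)                       # ['DT', 'NN', 'VBD']
    # A token without a tag falls back to UNK; an empty line returns None.
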
Example #4
    def test_duplicate(self):
        # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
        # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
        instance = Instance({
            "words":
            TextField(
                [Token("hello")],
                {"tokens": PretrainedTransformerIndexer("bert-base-uncased")})
        })

        other = instance.duplicate()
        assert other == instance

        # Adding new fields to the original instance should not affect the duplicate.
        instance.add_field("labels", LabelField("some_label"))
        assert "labels" not in other.fields
        assert other != instance  # sanity check on the '__eq__' method.
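
The pattern this test guards can be sketched minimally (the label values and the indexer-free `TextField` are illustrative; only `duplicate`, `add_field`, and the field classes come from the example):

    base = Instance({"words": TextField([Token("hello")], {})})
    positive = base.duplicate()
    positive.add_field("labels", LabelField("positive"))
    negative = base.duplicate()
    negative.add_field("labels", LabelField("negative"))
    assert "labels" not in base.fields  # mutating the copies leaves the original intact
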
Example #5
    def predict(self, inputs: JsonDict) -> JsonDict:
        qg_instances = list(
            self._question_model_dataset_reader.sentence_json_to_instances(
                inputs, verbs_only = True))
        qa_instances = list(
            self._question_to_span_model_dataset_reader.sentence_json_to_instances(
                inputs, verbs_only = True))
        if self._tan_model is not None:
            tan_instances = list(
                self._tan_model_dataset_reader.sentence_json_to_instances(
                    inputs, verbs_only = True))
            tan_outputs = self._tan_model.forward_on_instances(tan_instances)
        else:
            tan_outputs = [None for _ in qg_instances]
        if self._span_to_tan_model is not None:
            span_to_tan_instances = list(
                self._span_to_tan_model_dataset_reader.sentence_json_to_instances(
                    inputs, verbs_only = True))
        else:
            span_to_tan_instances = [None for _ in qg_instances]
        if self._animacy_model is not None:
            animacy_instances = list(
                self._animacy_model_dataset_reader.sentence_json_to_instances(
                    inputs, verbs_only = True))
        else:
            animacy_instances = [None for _ in qg_instances]

        verb_dicts = []
        for (qg_instance, qa_instance_template, tan_output,
             span_to_tan_instance, animacy_instance) in zip(
                 qg_instances, qa_instances, tan_outputs,
                 span_to_tan_instances, animacy_instances):
            qg_instance.index_fields(self._question_model.vocab)
            qgen_input_tensors = move_to_device(
                Batch([qg_instance]).as_tensor_dict(),
                self._question_model._get_prediction_device())
            _, all_question_slots, question_probs = self._question_model.beam_decode(
                text = qgen_input_tensors["text"],
                predicate_indicator = qgen_input_tensors["predicate_indicator"],
                predicate_index = qgen_input_tensors["predicate_index"],
                max_beam_size = self._question_beam_size,
                min_beam_probability = self._question_minimum_threshold,
                clause_mode = self._clause_mode)

            verb_qa_instances = []
            question_slots_list = []
            for i in range(len(question_probs)):
                qa_instance = Instance({k: v for k, v in qa_instance_template.fields.items()})
                question_slots = {}
                for slot_name in self._question_to_span_model.get_slot_names():
                    slot_label = all_question_slots[slot_name][i]
                    question_slots[slot_name] = slot_label
                    slot_label_field = LabelField(slot_label, get_slot_label_namespace(slot_name))
                    qa_instance.add_field(slot_name, slot_label_field, self._question_to_span_model.vocab)
                question_slots_list.append(question_slots)
                verb_qa_instances.append(qa_instance)
            if len(verb_qa_instances) > 0:
                qa_outputs = self._question_to_span_model.forward_on_instances(verb_qa_instances)
                if self._animacy_model is not None or self._span_to_tan_model is not None:
                    all_spans = list({
                        s for qa_output in qa_outputs
                        for s, p in qa_output["spans"]
                        if p >= self._span_minimum_threshold
                    })
                if self._animacy_model is not None:
                    animacy_instance.add_field(
                        "animacy_spans",
                        ListField([SpanField(s.start(), s.end(), animacy_instance["text"])
                                   for s in all_spans]),
                        self._animacy_model.vocab)
                    animacy_output = self._animacy_model.forward_on_instance(animacy_instance)
                else:
                    animacy_output = None
                if self._span_to_tan_model is not None:
                    span_to_tan_instance.add_field(
                        "tan_spans",
                        ListField([SpanField(s.start(), s.end(), span_to_tan_instance["text"])
                                   for s in all_spans]))
                    span_to_tan_output = self._span_to_tan_model.forward_on_instance(span_to_tan_instance)
                else:
                    span_to_tan_output = None
            else:
                qa_outputs = []
                animacy_output = None
                span_to_tan_output = None

            qa_beam = []
            for question_slots, question_prob, qa_output in zip(question_slots_list, question_probs, qa_outputs):
                scored_spans = [(s, p) for s, p in qa_output["spans"] if p >= self._span_minimum_threshold]
                invalid_dict = {}
                if self._question_to_span_model.classifies_invalids():
                    invalid_dict["invalidProb"] = qa_output["invalid_prob"].item()
                for span, span_prob in scored_spans:
                    qa_beam.append({
                        "questionSlots": question_slots,
                        "questionProb": question_prob,
                        **invalid_dict,
                        "span": [span.start(), span.end() + 1],
                        "spanProb": span_prob
                    })
            beam = { "qa_beam": qa_beam }
            if tan_output is not None:
                beam["tans"] = [
                    (self._tan_model.vocab.get_token_from_index(i, namespace = "tan-string-labels"), p)
                    for i, p in enumerate(tan_output["probs"].tolist())
                    if p >= self._tan_minimum_threshold
                ]
            if animacy_output is not None:
                beam["animacy"] = [
                    ([s.start(), s.end() + 1], p)
                    for s, p in zip(all_spans, animacy_output["probs"].tolist())
                ]
            if span_to_tan_output is not None:
                beam["span_tans"] = [
                    ([s.start(), s.end() + 1], [
                        (self._span_to_tan_model.vocab.get_token_from_index(i, namespace = "tan-string-labels"), p)
                        for i, p in enumerate(probs)
                        if p >= self._tan_minimum_threshold])
                    for s, probs in zip(all_spans, span_to_tan_output["probs"].tolist())
                ]
            verb_dicts.append({
                "verbIndex": qg_instance["metadata"]["verb_index"],
                "verbInflectedForms": qg_instance["metadata"]["verb_inflected_forms"],
                "beam": beam
            })
        return {
            "sentenceId": inputs["sentenceId"],
            "sentenceTokens": inputs["sentenceTokens"],
            "verbs": verb_dicts
        }
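
For orientation, the input and output shapes of `predict` can be sketched from the keys the code reads and writes; anything not visible in the code above (such as the verb annotations the dataset readers expect) is left open:

    # Hedged sketch of the I/O contract, inferred only from the code above.
    example_input = {
        "sentenceId": "sent-0",
        "sentenceTokens": ["The", "dog", "barked", "."],
        # ... plus whatever the readers' `sentence_json_to_instances` expects
    }
    # output = predictor.predict(example_input)
    # The output echoes "sentenceId" and "sentenceTokens" and adds "verbs":
    #   [{"verbIndex": ..., "verbInflectedForms": ..., "beam": {...}}, ...]
    # Each "beam" holds "qa_beam" entries with "questionSlots", "questionProb",
    # optional "invalidProb", "span" ([start, end_exclusive]) and "spanProb",
    # plus optional "tans", "animacy" and "span_tans" lists.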