def add_label(
    self,
    instance: Instance,
    label: Union[List[str], List[int], str, int],
    to_field: str = "label",
) -> Optional[Instance]:
    """Attach a classification label field to the instance data.

    Parameters
    ----------
    instance
        The instance the label field is added to.
    label
        The label data: (str, int) for single label, (list, np.ndarray)
        for multi label.
    to_field
        Name of the field the label is stored under.

    Returns
    -------
    The unchanged instance when `label` is None, the instance with the new
    field added when the label could be encoded, or None when label data was
    present but unusable (wrong type, or an empty string).
    """
    # Explicit None check: `if not label:` misbehaves for numpy ndarrays.
    if label is None:
        return instance

    label_field = None
    if self._multilabel:
        # Multi-label mode accepts a python list or a numpy array.
        if isinstance(label, numpy.ndarray):
            label = label.tolist()
        if isinstance(label, list):
            label_field = MultiLabelField(label, label_namespace=vocabulary.LABELS_NAMESPACE)
    elif isinstance(label, (str, int)) and label:
        # Single-label mode; the truthiness test filters out empty strings.
        label_field = LabelField(label, label_namespace=vocabulary.LABELS_NAMESPACE)

    # We have label info but cannot build a label field --> discard the instance.
    if label_field is None:
        return None

    instance.add_field(to_field, label_field)
    return instance
def _add_label(
    self,
    instance: Instance,
    label: Union[List[str], List[int], str, int],
    to_field: str = "label",
) -> Instance:
    """Adds the label field for classification into the instance data

    Helper function for the child's `self.featurize` method.

    Parameters
    ----------
    instance
        Add a label field to this instance
    label
        The label data
    to_field
        Name space of the field

    Returns
    -------
    instance
        If `label` is not None, return `instance` with the a label field added.
        Otherwise return just the given `instance`.

    Raises
    ------
    FeaturizeError
        If the label is an empty string or does not match the type:
        - (str, int) for single label
        - (list, np.array) for multi label
    """
    # "if not label:" fails for ndarrays, this is why we explicitly check for None
    if label is None:
        return instance

    label_field = None
    if self._multilabel:
        # Multi-label mode accepts a python list or a numpy array.
        if isinstance(label, numpy.ndarray):
            label = label.tolist()
        if isinstance(label, list):
            label_field = MultiLabelField(label, label_namespace=vocabulary.LABELS_NAMESPACE)
    elif isinstance(label, (str, int)) and label:
        # Single-label mode; the truthiness test filters out empty strings.
        label_field = LabelField(label, label_namespace=vocabulary.LABELS_NAMESPACE)

    if label_field is None:
        # We have label info but we cannot build the label field --> discard the instance
        raise FeaturizeError(f"Cannot create label field for `label={label}`!")

    instance.add_field(to_field, label_field)
    return instance
def text_to_instance(self, line: str) -> Optional[Instance]:
    """Turn one `word<delim>tag word<delim>tag ...` line into an Instance.

    Parameters
    ----------
    line
        A whitespace-tokenizable line of `word<delimiter>tag` pairs.

    Returns
    -------
    An Instance with a `tokens` TextField and a `tags` SequenceLabelField,
    or None when the line tokenizes to nothing. Tokens with no tag get UNK.
    """
    toks_tags = self.tokenizer.tokenize(line)
    if not toks_tags:
        return None

    tokens = []
    tags = []
    for tok_tag in toks_tags:
        # Starred unpacking makes `tag` a (possibly empty) list of the
        # pieces after the first delimiter.
        tok, *tag = tok_tag.text.split(self._word_tag_delimiter)
        tokens.append(Token(tok))
        # BUG FIX: the original appended `tag or UNK`, which put the *list*
        # `tag` into `tags` whenever a tag was present, yielding a mixed
        # list[str | list[str]] for SequenceLabelField. Append the tag
        # string itself instead.
        tags.append(tag[0] if tag else UNK)

    inst = Instance({'tokens': TextField(tokens, {})})
    inst.add_field('tags', SequenceLabelField(tags, inst['tokens']))
    return inst
def test_duplicate(self):
    """Regression test: `duplicate()` must work with a
    `PretrainedTransformerIndexer` inside a `TextField`.
    See https://github.com/allenai/allennlp/issues/4270.
    """
    original = Instance(
        {
            "words": TextField(
                [Token("hello")],
                {"tokens": PretrainedTransformerIndexer("bert-base-uncased")},
            )
        }
    )
    copy = original.duplicate()
    assert copy == original

    # Adding new fields to the original must not affect the duplicate.
    original.add_field("labels", LabelField("some_label"))
    assert "labels" not in copy.fields
    # Sanity check on the '__eq__' method.
    assert copy != original
def predict(self, inputs: JsonDict) -> JsonDict:
    """Run the full question-generation / question-to-span pipeline on one
    sentence JSON and return a JSON-serializable beam per verb.

    Parameters
    ----------
    inputs
        Sentence JSON; at minimum it carries "sentenceId" and
        "sentenceTokens" (both echoed into the output), and whatever the
        dataset readers' `sentence_json_to_instances` consumes.

    Returns
    -------
    A dict with "sentenceId", "sentenceTokens", and "verbs": one entry per
    verb with its index, inflected forms, and the predicted QA beam
    (optionally augmented with TAN / animacy / span-TAN predictions when
    the corresponding optional models are configured).
    """
    # One instance per verb, in the same order for every reader, so the
    # per-verb zip below lines up.
    qg_instances = list(self._question_model_dataset_reader.sentence_json_to_instances(inputs, verbs_only = True))
    qa_instances = list(self._question_to_span_model_dataset_reader.sentence_json_to_instances(inputs, verbs_only = True))
    # TAN predictions are computed up front for all verbs; the other
    # optional models are run lazily inside the loop.
    if self._tan_model is not None:
        tan_instances = list(self._tan_model_dataset_reader.sentence_json_to_instances(inputs, verbs_only = True))
        tan_outputs = self._tan_model.forward_on_instances(tan_instances)
    else:
        tan_outputs = [None for _ in qg_instances]
    if self._span_to_tan_model is not None:
        span_to_tan_instances = list(self._span_to_tan_model_dataset_reader.sentence_json_to_instances(inputs, verbs_only = True))
    else:
        span_to_tan_instances = [None for _ in qg_instances]
    if self._animacy_model is not None:
        animacy_instances = list(self._animacy_model_dataset_reader.sentence_json_to_instances(inputs, verbs_only = True))
    else:
        animacy_instances = [None for _ in qg_instances]
    verb_dicts = []
    for (qg_instance, qa_instance_template, tan_output, span_to_tan_instance, animacy_instance) in zip(qg_instances, qa_instances, tan_outputs, span_to_tan_instances, animacy_instances):
        # Tensorize the single QG instance and beam-decode candidate
        # questions (as slot assignments) with their probabilities.
        qg_instance.index_fields(self._question_model.vocab)
        qgen_input_tensors = move_to_device(
            Batch([qg_instance]).as_tensor_dict(),
            self._question_model._get_prediction_device())
        _, all_question_slots, question_probs = self._question_model.beam_decode(
            text = qgen_input_tensors["text"],
            predicate_indicator = qgen_input_tensors["predicate_indicator"],
            predicate_index = qgen_input_tensors["predicate_index"],
            max_beam_size = self._question_beam_size,
            min_beam_probability = self._question_minimum_threshold,
            clause_mode = self._clause_mode)
        # Build one question-to-span instance per beam entry by cloning the
        # template's fields and adding the decoded slot labels.
        verb_qa_instances = []
        question_slots_list = []
        for i in range(len(question_probs)):
            qa_instance = Instance({k: v for k, v in qa_instance_template.fields.items()})
            question_slots = {}
            for slot_name in self._question_to_span_model.get_slot_names():
                slot_label = all_question_slots[slot_name][i]
                question_slots[slot_name] = slot_label
                slot_label_field = LabelField(slot_label, get_slot_label_namespace(slot_name))
                qa_instance.add_field(slot_name, slot_label_field, self._question_to_span_model.vocab)
            question_slots_list.append(question_slots)
            verb_qa_instances.append(qa_instance)
        if len(verb_qa_instances) > 0:
            qa_outputs = self._question_to_span_model.forward_on_instances(verb_qa_instances)
            if self._animacy_model is not None or self._span_to_tan_model is not None:
                # Deduplicated answer spans above threshold, shared by the
                # animacy and span-to-TAN models below.
                all_spans = list(set([s for qa_output in qa_outputs for s, p in qa_output["spans"] if p >= self._span_minimum_threshold]))
                if self._animacy_model is not None:
                    animacy_instance.add_field("animacy_spans", ListField([SpanField(s.start(), s.end(), animacy_instance["text"]) for s in all_spans]), self._animacy_model.vocab)
                    animacy_output = self._animacy_model.forward_on_instance(animacy_instance)
                else:
                    animacy_output = None
                if self._span_to_tan_model is not None:
                    span_to_tan_instance.add_field("tan_spans", ListField([SpanField(s.start(), s.end(), span_to_tan_instance["text"]) for s in all_spans]))
                    span_to_tan_output = self._span_to_tan_model.forward_on_instance(span_to_tan_instance)
                else:
                    span_to_tan_output = None
            # NOTE(review): when verb_qa_instances is non-empty but neither
            # the animacy nor the span-to-TAN model is configured,
            # animacy_output / span_to_tan_output are never bound here; the
            # `is not None` checks below would then raise NameError — TODO
            # confirm this path cannot occur in practice.
        else:
            qa_outputs = []
            animacy_output = None
            span_to_tan_output = None
        # Flatten (question, span) pairs above the span threshold into the
        # serializable beam. Span ends are converted to exclusive indices.
        qa_beam = []
        for question_slots, question_prob, qa_output in zip(question_slots_list, question_probs, qa_outputs):
            scored_spans = [(s, p) for s, p in qa_output["spans"] if p >= self._span_minimum_threshold]
            invalid_dict = {}
            if self._question_to_span_model.classifies_invalids():
                invalid_dict["invalidProb"] = qa_output["invalid_prob"].item()
            for span, span_prob in scored_spans:
                qa_beam.append({
                    "questionSlots": question_slots,
                    "questionProb": question_prob,
                    **invalid_dict,
                    "span": [span.start(), span.end() + 1],
                    "spanProb": span_prob
                })
        beam = {
            "qa_beam": qa_beam
        }
        if tan_output is not None:
            # TAN labels above threshold, decoded through the TAN vocab.
            beam["tans"] = [
                (self._tan_model.vocab.get_token_from_index(i, namespace = "tan-string-labels"), p)
                for i, p in enumerate(tan_output["probs"].tolist())
                if p >= self._tan_minimum_threshold
            ]
        if animacy_output is not None:
            # One animacy probability per span in all_spans (same order).
            beam["animacy"] = [
                ([s.start(), s.end() + 1], p)
                for s, p in zip(all_spans, animacy_output["probs"].tolist())
            ]
        if span_to_tan_output is not None:
            # Per-span TAN label distributions, thresholded like "tans".
            beam["span_tans"] = [
                ([s.start(), s.end() + 1], [
                    (self._span_to_tan_model.vocab.get_token_from_index(i, namespace = "tan-string-labels"), p)
                    for i, p in enumerate(probs)
                    if p >= self._tan_minimum_threshold])
                for s, probs in zip(all_spans, span_to_tan_output["probs"].tolist())
            ]
        verb_dicts.append({
            "verbIndex": qg_instance["metadata"]["verb_index"],
            "verbInflectedForms": qg_instance["metadata"]["verb_inflected_forms"],
            "beam": beam
        })
    return {
        "sentenceId": inputs["sentenceId"],
        "sentenceTokens": inputs["sentenceTokens"],
        "verbs": verb_dicts
    }