def make_instance(self, record, idx, indexers, model_preprocessing_interface) -> Instance:
        """Convert a single edge-probing record to an AllenNLP Instance.

        BUG FIX: the return annotation was ``Type[Instance]``, but the method
        returns an *instance* (``Instance(d)``), not the class object.

        Args:
            record: dict in edge-probing JSON format; must contain "text"
                (a space-tokenized string) and "targets" (a list of dicts,
                each with "span1", a "label", and — unless this task is
                single-sided — a "span2").
            idx: index of this record; stored verbatim as metadata.
            indexers: AllenNLP token indexers used to build the text field.
            model_preprocessing_interface: provides ``boundary_token_fn`` to
                add model-appropriate boundary tokens (e.g. [CLS]/[SEP]).

        Returns:
            An Instance with fields "idx", "input1", "span1s", "labels",
            and — when ``self.single_sided`` is false — "span2s".
        """
        tokens = record["text"].split()  # already space-tokenized by Moses
        tokens = model_preprocessing_interface.boundary_token_fn(
            tokens
        )  # apply model-appropriate variants of [cls] and [sep].
        text_field = sentence_to_text_field(tokens, indexers)

        d = {}
        d["idx"] = MetadataField(idx)

        d["input1"] = text_field

        d["span1s"] = ListField(
            [self._make_span_field(t["span1"], text_field, 1) for t in record["targets"]]
        )
        if not self.single_sided:
            d["span2s"] = ListField(
                [self._make_span_field(t["span2"], text_field, 1) for t in record["targets"]]
            )

        # Always use multilabel targets, so be sure each label is a list.
        labels = [utils.wrap_singleton_string(t["label"]) for t in record["targets"]]
        d["labels"] = ListField(
            [
                MultiLabelField(
                    label_set, label_namespace=self._label_namespace, skip_indexing=False
                )
                for label_set in labels
            ]
        )
        return Instance(d)
def convert_to_example(record: Dict):
    """Convert an edge probing record to a TensorFlow example.

    The example has the following features:
        - text: single string, the text
        - targets.span1: list of int64, alternating start, end indices
        - targets.span2: (optional), list of int64, as targets.span1
        - targets.label: list of strings (see note below)
        - info: single string, serialized info JSON
        - targets.info: list of strings, serialized info JSON for each target

    Due to the limitations of tf.Example, spans are packed into a single flat
    list of length 2*num_targets containing alternating endpoints: [s0, e0, s1,
    e1, ..., sn, en]. You can get individual spans back with tf.reshape(spans,
    [-1, 2]).

    If examples have multiple labels per target (such as for SPR2), these are
    joined into a single string on spaces:
        label: ["foo", "bar", "baz"] -> "foo bar baz"
    You can use tf.string_split and tf.sparse.to_dense to convert these into an
    array of targets.

    Args:
        record: dict, in edge probing record (JSON) format.

    Returns:
        tf.train.Example with features described above.
    """
    ex = tf.train.Example()
    add_string_feature(ex, "text", record["text"])
    add_string_feature(ex, "info", json.dumps(record.get("info", {})))
    for target in record["targets"]:
        label_string = " ".join(utils.wrap_singleton_string(target["label"]))
        add_string_feature(ex, "targets.label", label_string)
        add_ints_feature(ex, "targets.span1", target["span1"])
        if "span2" in target:
            add_ints_feature(ex, "targets.span2", target["span2"])
        # BUG FIX: was "target.info" (singular), which contradicted both the
        # docstring above and the "targets.*" prefix used by every other
        # per-target feature — consumers reading "targets.info" got nothing.
        add_string_feature(ex, "targets.info",
                           json.dumps(target.get("info", {})))

    # Verify that span2 is either empty or aligned to span1.
    num_span1s = len(ex.features.feature["targets.span1"].int64_list.value)
    num_span2s = len(ex.features.feature["targets.span2"].int64_list.value)
    assert num_span2s == num_span1s or num_span2s == 0, (
        "targets.span2 must be empty or aligned to targets.span1 "
        "({:d} vs {:d} endpoints)".format(num_span2s, num_span1s)
    )
    return ex
# Esempio n. 3 (Example no. 3) — scraper separator; original stray "0" removed from code path
    def __str__(self):
        """Render this record as human-readable text for inspection."""
        out = io.StringIO()
        raw_text = self._data["text"]
        toks = raw_text.split()
        out.write("Text ({:d}): {:s}\n".format(len(toks), raw_text))

        for target in self._data["targets"]:
            out.write("\n")
            span1_repr = self.format_span(toks, *target["span1"])
            out.write("  span1: {}\n".format(span1_repr))
            if "span2" in target:
                span2_repr = self.format_span(toks, *target["span2"])
                out.write("  span2: {}\n".format(span2_repr))
            label_list = utils.wrap_singleton_string(target["label"])
            out.write("  label: ({:d})\t\t {}\n".format(
                len(label_list), ", ".join(label_list)))
            # Include model predictions when the record carries them.
            if "preds" in target:
                out.write(self._fmt_preds(target["preds"]))

        return out.getvalue()
# Esempio n. 4 (Example no. 4) — scraper separator; original stray "0" removed from code path
    def _split_and_flatten_records(self, records: Iterable[Dict]):
        """Split edge-probing records into flat example- and target-level rows.

        Returns two lists of dicts: one row per example (text plus flattened
        info/preds, minus targets) and one row per target; each row carries
        an 'idx' column tying targets back to their source example.
        """
        example_rows = []  # long-form example records, minus targets
        target_rows = []   # long-form target records with 'idx' column
        for record_idx, record in enumerate(records):
            row = {"text": record["text"], "idx": record_idx}
            row.update(_get_nested_vals(record, "info"))
            row.update(_get_nested_vals(record, "preds"))
            example_rows.append(row)

            for target in record["targets"]:
                t_row = {
                    "label": utils.wrap_singleton_string(target["label"]),
                    "idx": record_idx,
                }
                # Spans are stored as tuples so rows are hashable/comparable.
                for span_key in ("span1", "span2"):
                    if span_key in target:
                        t_row[span_key] = tuple(target[span_key])
                t_row.update(_get_nested_vals(target, "info"))
                t_row.update(_get_nested_vals(target, "preds"))
                target_rows.append(t_row)
        return example_rows, target_rows