Example #1
    def __init__(self, path, max_seq_len, name="qamr", **kw):
        self.path = path
        super(QAMRTask, self).__init__(name, **kw)
        self.max_seq_len = max_seq_len

        self.train_data_text = None
        self.val_data_text = None
        self.test_data_text = None

        self.f1_metric = F1SpanMetric()
        self.em_metric = ExactMatchSpanMetric()

        self.val_metric = "%s_avg" % self.name
        self.val_metric_decreases = False
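
A minimal usage sketch for the constructor above (the full class appears in Example #3). The data path and the tokenizer_name keyword are illustrative assumptions; extra keyword arguments are simply forwarded to the base task class via **kw.

# Hypothetical instantiation; "/data/tasks" and the tokenizer name are illustrative.
task = QAMRTask(path="/data/tasks", max_seq_len=128, tokenizer_name="MosesTokenizer")
task.load_data()  # expects qamr/data/wiki-sentences.tsv and qamr/data/filtered/*.tsv under `path`
print(task.example_counts)  # {"train": ..., "val": ..., "test": ...}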
Example #2
    def __init__(self, path, max_seq_len, name, **kw):
        """QA-SRL (Question-Answer Driven Semantic Role Labeling)
        See http://qasrl.org/
        Download, unzip, and rename the "qasrl-v2" folder to "QASRL"
        """
        super(QASRLTask, self).__init__(name, **kw)
        self.path = path
        self.max_seq_len = max_seq_len

        self.train_data_text = None
        self.val_data_text = None
        self.test_data_text = None

        self.f1_metric = F1SpanMetric()
        self.em_metric = ExactMatchSpanMetric()

        self.val_metric = "%s_avg" % self.name
        self.val_metric_decreases = False
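
A similar sketch for QASRLTask, following the docstring above: the path should point at the unzipped "qasrl-v2" folder after renaming it to "QASRL". The concrete values (path, name, tokenizer) are illustrative assumptions.

# Hypothetical instantiation; the path points at the renamed "QASRL" folder.
task = QASRLTask(path="/data/tasks/QASRL", max_seq_len=128, name="qasrl",
                 tokenizer_name="MosesTokenizer")
task.load_data()  # reads orig/train.jsonl.gz, orig/dev.jsonl.gz and orig/test.jsonl.gz (see Example #4)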
Example #3
class QAMRTask(SpanPredictionTask):
    """ Question-Answer Meaning Representation (QAMR)
        https://github.com/uwnlp/qamr
    """
    def __init__(self, path, max_seq_len, name="qamr", **kw):
        self.path = path
        super(QAMRTask, self).__init__(name, **kw)
        self.max_seq_len = max_seq_len

        self.train_data_text = None
        self.val_data_text = None
        self.test_data_text = None

        self.f1_metric = F1SpanMetric()
        self.em_metric = ExactMatchSpanMetric()

        self.val_metric = "%s_avg" % self.name
        self.val_metric_decreases = False

    def get_metrics(self, reset: bool = False) -> Dict:
        f1 = self.f1_metric.get_metric(reset)
        em = self.em_metric.get_metric(reset)
        collected_metrics = {"f1": f1, "em": em, "avg": (f1 + em) / 2}
        return collected_metrics

    def get_sentences(self) -> Iterable[Sequence[str]]:
        """ Yield sentences, used to compute vocabulary. """
        yield from self.sentences

    def process_split(
            self, split, indexers,
            model_preprocessing_interface) -> Iterable[Type[Instance]]:
        def _make_instance(example):
            d = dict()

            # For human readability
            d["raw_passage"] = MetadataField(" ".join(example["passage"]))
            d["raw_question"] = MetadataField(" ".join(example["question"]))

            if model_preprocessing_interface.model_flags[
                    "uses_pair_embedding"]:
                inp, start_offset, _ = model_preprocessing_interface.boundary_token_fn(
                    example["passage"], example["question"], get_offset=True)
                d["inputs"] = sentence_to_text_field(inp, indexers)
            else:
                d["passage"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(
                        example["passage"]), indexers)
                d["question"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(
                        example["question"]), indexers)
                start_offset = 0
            d["span_start"] = NumericField(example["answer_span"][0] +
                                           start_offset,
                                           label_namespace="span_start_labels")
            d["span_end"] = NumericField(example["answer_span"][1] +
                                         start_offset,
                                         label_namespace="span_end_labels")
            d["start_offset"] = MetadataField(start_offset)
            d["passage_str"] = MetadataField(example["passage_str"])
            d["answer_str"] = MetadataField(example["answer_str"])
            d["space_processed_token_map"] = MetadataField(
                example["space_processed_token_map"])
            return Instance(d)

        instances = map(_make_instance, split)
        return instances

    def get_split_text(self, split: str):
        return getattr(self, "%s_data_text" % split)

    @classmethod
    def load_tsv_dataset(cls, path, wiki_dict):
        df = pd.read_csv(
            path,
            sep="\t",
            header=None,
            names=[
                "sent_id",
                "target_ids",
                "worker_id",
                "qa_index",
                "qa_word",
                "question",
                "answer",
                "response1",
                "response2",
            ],
        )
        df["sent"] = df["sent_id"].apply(wiki_dict.get)
        return df

    def process_dataset(self, data_df, shuffle=False):
        example_list = []
        moses = MosesTokenizer()
        for i, row in data_df.iterrows():
            # Answer indices are a space-delimited list of numbers.
            # We simply take the min/max of the indices.
            answer_idxs = list(map(int, row["answer"].split()))
            ans_tok_start = min(answer_idxs)
            ans_tok_end = max(answer_idxs) + 1  # exclusive

            remapped_result = remap_ptb_passage_and_answer_spans(
                ptb_tokens=row["sent"].split(),
                answer_span=(ans_tok_start, ans_tok_end),
                moses=moses,
                tokenizer_name=self.tokenizer_name,
            )
            example_list.append({
                "passage": self._process_sentence(remapped_result["detok_sent"]),
                "question": self._process_sentence(row["question"]),
                "answer_span": remapped_result["answer_token_span"],
                "passage_str": remapped_result["detok_sent"],
                "answer_str": remapped_result["answer_str"],
                "space_processed_token_map": remapped_result["space_processed_token_map"],
            })

        if shuffle:
            random.Random(12345).shuffle(example_list)

        return example_list

    def _process_sentence(self, sent):
        return tokenize_and_truncate(tokenizer_name=self.tokenizer_name,
                                     sent=sent,
                                     max_seq_len=self.max_seq_len)

    @classmethod
    def load_wiki_dict(cls, path):
        wiki_df = pd.read_csv(path, sep="\t", names=["sent_id", "text"])
        wiki_dict = {
            row["sent_id"]: row["text"]
            for _, row in wiki_df.iterrows()
        }
        return wiki_dict

    def load_data(self):
        wiki_dict = self.load_wiki_dict(
            os.path.join(self.path, "qamr/data/wiki-sentences.tsv"))
        self.train_data_text = self.process_dataset(
            self.load_tsv_dataset(
                path=os.path.join(self.path, "qamr/data/filtered/train.tsv"),
                wiki_dict=wiki_dict))
        # Shuffle val data to ensure diversity in periodic validation with val_data_limit.
        self.val_data_text = self.process_dataset(
            self.load_tsv_dataset(
                path=os.path.join(self.path, "qamr/data/filtered/dev.tsv"),
                wiki_dict=wiki_dict),
            shuffle=True,
        )
        self.test_data_text = self.process_dataset(
            self.load_tsv_dataset(
                path=os.path.join(self.path, "qamr/data/filtered/test.tsv"),
                wiki_dict=wiki_dict))

        self.sentences = (
            [example["passage"] for example in self.train_data_text] +
            [example["question"] for example in self.train_data_text] +
            [example["passage"] for example in self.val_data_text] +
            [example["question"] for example in self.val_data_text])
        self.example_counts = {
            "train": len(self.train_data_text),
            "val": len(self.val_data_text),
            "test": len(self.test_data_text),
        }

    @staticmethod
    def collapse_contiguous_indices(ls):
        """
        [2, 3, 4, 5, 6, 7, 8] -> [(2, 9)]
        [1, 2, 4, 5] -> [(1, 3), (4, 6)]
        """
        if not ls:
            return []
        output = []
        start = None
        prev = None
        for n in ls:
            if start is None:
                start = n
                prev = n
            elif n == prev + 1:
                prev += 1
                continue
            else:
                output.append((start, prev + 1))  # exclusive
                start = n
                prev = n
        output.append((start, prev + 1))  # exclusive
        return output
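
A short sketch that exercises collapse_contiguous_indices with the cases listed in its docstring; the calls below only restate those documented examples.

# Illustrative checks mirroring the docstring examples above.
assert QAMRTask.collapse_contiguous_indices([2, 3, 4, 5, 6, 7, 8]) == [(2, 9)]
assert QAMRTask.collapse_contiguous_indices([1, 2, 4, 5]) == [(1, 3), (4, 6)]
assert QAMRTask.collapse_contiguous_indices([]) == []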
Example #4
class QASRLTask(SpanPredictionTask):
    def __init__(self, path, max_seq_len, name, **kw):
        """QA-SRL (Question-Answer Driven Semantic Role Labeling)
        See http://qasrl.org/
        Download, unzip, and rename the "qasrl-v2" folder to "QASRL"
        """
        super(QASRLTask, self).__init__(name, **kw)
        self.path = path
        self.max_seq_len = max_seq_len

        self.train_data_text = None
        self.val_data_text = None
        self.test_data_text = None

        self.f1_metric = F1SpanMetric()
        self.em_metric = ExactMatchSpanMetric()

        self.val_metric = "%s_avg" % self.name
        self.val_metric_decreases = False

    def count_examples(self, splits=["train", "val", "test"]):
        """ Count examples in the dataset.

        No-op here: self.example_counts is populated directly in load_data().
        """
        pass

    def get_metrics(self, reset: bool = False) -> Dict:
        f1 = self.f1_metric.get_metric(reset)
        em = self.em_metric.get_metric(reset)
        collected_metrics = {"f1": f1, "em": em, "avg": (f1 + em) / 2}
        return collected_metrics

    def load_data(self):
        self.train_data_text = self._load_file(
            os.path.join(self.path, "orig", "train.jsonl.gz"))

        # Shuffle val_data to ensure diversity in periodic validation with val_data_limit
        self.val_data_text = self._load_file(
            os.path.join(self.path, "orig", "dev.jsonl.gz"), shuffle=True)

        self.test_data_text = self._load_file(
            os.path.join(self.path, "orig", "test.jsonl.gz"))

        self.sentences = (
            [example["passage"] for example in self.train_data_text] +
            [example["question"] for example in self.train_data_text] +
            [example["passage"] for example in self.val_data_text] +
            [example["question"] for example in self.val_data_text])
        self.example_counts = {
            "train": len(self.train_data_text),
            "val": len(self.val_data_text),
            "test": len(self.test_data_text),
        }

    def get_sentences(self) -> Iterable[Sequence[str]]:
        """ Yield sentences, used to compute vocabulary. """
        yield from self.sentences

    def process_split(
            self, split, indexers,
            model_preprocessing_interface) -> Iterable[Type[Instance]]:
        def _make_instance(example):
            d = dict()

            # For human readability
            d["raw_passage"] = MetadataField(" ".join(example["passage"]))
            d["raw_question"] = MetadataField(" ".join(example["question"]))

            if model_preprocessing_interface.model_flags[
                    "uses_pair_embedding"]:
                inp, start_offset, _ = model_preprocessing_interface.boundary_token_fn(
                    example["passage"], example["question"], get_offset=True)
                d["inputs"] = sentence_to_text_field(inp, indexers)
            else:
                d["passage"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(
                        example["passage"]), indexers)
                d["question"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(
                        example["question"]), indexers)
                start_offset = 0
            d["span_start"] = NumericField(example["answer_span"][0] +
                                           start_offset,
                                           label_namespace="span_start_labels")
            d["span_end"] = NumericField(example["answer_span"][1] +
                                         start_offset,
                                         label_namespace="span_end_labels")
            d["start_offset"] = MetadataField(start_offset)
            d["passage_str"] = MetadataField(example["passage_str"])
            d["answer_str"] = MetadataField(example["answer_str"])
            d["space_processed_token_map"] = MetadataField(
                example["space_processed_token_map"])
            return Instance(d)

        instances = map(_make_instance, split)
        return instances

    def _load_file(self, path, shuffle=False):
        example_list = []
        moses = MosesTokenizer()
        failed = 0
        with gzip.open(path) as f:
            lines = f.read().splitlines()

            for line in lines:
                datum = self.preprocess_qasrl_datum(json.loads(line))
                for entry in datum["entries"]:
                    for question, answer_list in entry["questions"].items():
                        for answer in answer_list:
                            for answer_span in answer:
                                answer_tok_span = (
                                    answer_span["span"][0],
                                    answer_span["span"][1] + 1,  # exclusive
                                )
                                try:
                                    remapped_result = remap_ptb_passage_and_answer_spans(
                                        ptb_tokens=datum["sentence_tokens"],
                                        answer_span=answer_tok_span,
                                        moses=moses,
                                        # Note: the alignment could actually be moved outside this loop.
                                        tokenizer_name=self.tokenizer_name,
                                    )
                                except ValueError:
                                    failed += 1
                                    continue
                                example_list.append({
                                    "passage": self._process_sentence(
                                        remapped_result["detok_sent"]),
                                    "question": self._process_sentence(question),
                                    "answer_span": remapped_result["answer_token_span"],
                                    "passage_str": remapped_result["detok_sent"],
                                    "answer_str": remapped_result["answer_str"],
                                    "space_processed_token_map":
                                        remapped_result["space_processed_token_map"],
                                })

        if failed:
            log.info("FAILED ({}): {}".format(failed, path))

        if shuffle:
            random.Random(1234).shuffle(example_list)
        return example_list

    def _process_sentence(self, sent):
        return tokenize_and_truncate(tokenizer_name=self.tokenizer_name,
                                     sent=sent,
                                     max_seq_len=self.max_seq_len)

    def get_split_text(self, split: str):
        return getattr(self, "%s_data_text" % split)

    @classmethod
    def preprocess_qasrl_datum(cls, datum):
        """ Extract relevant fields """
        return {
            "sentence_tokens": datum["sentenceTokens"],
            "entries": [
                {
                    "verb": verb_entry["verbInflectedForms"]["stem"],
                    "verb_idx": verb_idx,
                    "questions": {
                        question: [
                            [
                                {
                                    "tokens": datum["sentenceTokens"][span[0]:span[1] + 1],
                                    "span": span,
                                }
                                for span in answer_judgment["spans"]
                            ]
                            for answer_judgment in q_data["answerJudgments"]
                            if answer_judgment["isValid"]
                        ]
                        for question, q_data in verb_entry["questionLabels"].items()
                    },
                }
                for verb_idx, verb_entry in datum["verbEntries"].items()
            ],
        }
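
A minimal sketch of what preprocess_qasrl_datum consumes and produces, assuming only the JSON field names the method itself reads; the sentence, question, and span values are made up for illustration.

# Hypothetical QA-SRL datum, reduced to the fields the method reads.
datum = {
    "sentenceTokens": ["The", "cat", "chased", "the", "mouse", "."],
    "verbEntries": {
        "2": {
            "verbInflectedForms": {"stem": "chase"},
            "questionLabels": {
                "What chased something?": {
                    "answerJudgments": [
                        {"isValid": True, "spans": [[0, 1]]},
                    ],
                },
            },
        },
    },
}
processed = QASRLTask.preprocess_qasrl_datum(datum)
# processed["entries"][0]["questions"]["What chased something?"] ==
#     [[{"tokens": ["The", "cat"], "span": [0, 1]}]]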