class QAMRTask(SpanPredictionTask):
    """Question-Answer Meaning Representation (QAMR)

    https://github.com/uwnlp/qamr
    """

    def __init__(self, path, max_seq_len, name="qamr", **kw):
        self.path = path
        super(QAMRTask, self).__init__(name, **kw)
        self.max_seq_len = max_seq_len

        self.train_data_text = None
        self.val_data_text = None
        self.test_data_text = None

        self.f1_metric = F1SpanMetric()
        self.em_metric = ExactMatchSpanMetric()

        self.val_metric = "%s_avg" % self.name
        self.val_metric_decreases = False

    def get_metrics(self, reset: bool = False) -> Dict:
        f1 = self.f1_metric.get_metric(reset)
        em = self.em_metric.get_metric(reset)
        collected_metrics = {"f1": f1, "em": em, "avg": (f1 + em) / 2}
        return collected_metrics

    def get_sentences(self) -> Iterable[Sequence[str]]:
        """ Yield sentences, used to compute vocabulary. """
        yield from self.sentences

    def process_split(
        self, split, indexers, model_preprocessing_interface
    ) -> Iterable[Type[Instance]]:
        def _make_instance(example):
            d = dict()

            # For human-readability
            d["raw_passage"] = MetadataField(" ".join(example["passage"]))
            d["raw_question"] = MetadataField(" ".join(example["question"]))

            if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
                # Passage and question share one input sequence; the answer
                # span indexes into the passage, so it is shifted below by
                # the offset introduced by the boundary tokens.
                inp, start_offset, _ = model_preprocessing_interface.boundary_token_fn(
                    example["passage"], example["question"], get_offset=True
                )
                d["inputs"] = sentence_to_text_field(inp, indexers)
            else:
                d["passage"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(example["passage"]),
                    indexers,
                )
                d["question"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(example["question"]),
                    indexers,
                )
                start_offset = 0
            d["span_start"] = NumericField(
                example["answer_span"][0] + start_offset,
                label_namespace="span_start_labels",
            )
            d["span_end"] = NumericField(
                example["answer_span"][1] + start_offset,
                label_namespace="span_end_labels",
            )
            d["start_offset"] = MetadataField(start_offset)
            d["passage_str"] = MetadataField(example["passage_str"])
            d["answer_str"] = MetadataField(example["answer_str"])
            d["space_processed_token_map"] = MetadataField(
                example["space_processed_token_map"]
            )
            return Instance(d)

        instances = map(_make_instance, split)
        return instances

    def get_split_text(self, split: str):
        return getattr(self, "%s_data_text" % split)

    @classmethod
    def load_tsv_dataset(cls, path, wiki_dict):
        df = pd.read_csv(
            path,
            sep="\t",
            header=None,
            names=[
                "sent_id",
                "target_ids",
                "worker_id",
                "qa_index",
                "qa_word",
                "question",
                "answer",
                "response1",
                "response2",
            ],
        )
        # Resolve each sentence ID to its text via the wiki-sentences lookup
        df["sent"] = df["sent_id"].apply(wiki_dict.get)
        return df

    def process_dataset(self, data_df, shuffle=False):
        example_list = []
        moses = MosesTokenizer()
        for _, row in data_df.iterrows():
            # Answer indices are a space-delimited list of token indices;
            # we simply take the min/max of the indices.
            answer_idxs = list(map(int, row["answer"].split()))
            ans_tok_start, ans_tok_end = min(answer_idxs), max(answer_idxs) + 1  # Exclusive
            remapped_result = remap_ptb_passage_and_answer_spans(
                ptb_tokens=row["sent"].split(),
                answer_span=(ans_tok_start, ans_tok_end),
                moses=moses,
                tokenizer_name=self.tokenizer_name,
            )
            example_list.append(
                {
                    "passage": self._process_sentence(remapped_result["detok_sent"]),
                    "question": self._process_sentence(row["question"]),
                    "answer_span": remapped_result["answer_token_span"],
                    "passage_str": remapped_result["detok_sent"],
                    "answer_str": remapped_result["answer_str"],
                    "space_processed_token_map": remapped_result["space_processed_token_map"],
                }
            )
        if shuffle:
            random.Random(12345).shuffle(example_list)
        return example_list

    def _process_sentence(self, sent):
        return tokenize_and_truncate(
            tokenizer_name=self.tokenizer_name, sent=sent, max_seq_len=self.max_seq_len
        )

    @classmethod
    def load_wiki_dict(cls, path):
        wiki_df = pd.read_csv(path, sep="\t", names=["sent_id", "text"])
        wiki_dict = {row["sent_id"]: row["text"] for _, row in wiki_df.iterrows()}
        return wiki_dict

    def load_data(self):
        wiki_dict = self.load_wiki_dict(os.path.join(self.path, "qamr/data/wiki-sentences.tsv"))
        self.train_data_text = self.process_dataset(
            self.load_tsv_dataset(
                path=os.path.join(self.path, "qamr/data/filtered/train.tsv"),
                wiki_dict=wiki_dict,
            )
        )
        # Shuffle val data to ensure diversity in periodic validation with val_data_limit
        self.val_data_text = self.process_dataset(
            self.load_tsv_dataset(
                path=os.path.join(self.path, "qamr/data/filtered/dev.tsv"),
                wiki_dict=wiki_dict,
            ),
            shuffle=True,
        )
        self.test_data_text = self.process_dataset(
            self.load_tsv_dataset(
                path=os.path.join(self.path, "qamr/data/filtered/test.tsv"),
                wiki_dict=wiki_dict,
            )
        )
        self.sentences = (
            [example["passage"] for example in self.train_data_text]
            + [example["question"] for example in self.train_data_text]
            + [example["passage"] for example in self.val_data_text]
            + [example["question"] for example in self.val_data_text]
        )
        self.example_counts = {
            "train": len(self.train_data_text),
            "val": len(self.val_data_text),
            "test": len(self.test_data_text),
        }

    @staticmethod
    def collapse_contiguous_indices(ls):
        """
        [2, 3, 4, 5, 6, 7, 8] -> [(2, 9)]
        [1, 2, 4, 5] -> [(1, 3), (4, 6)]
        """
        if not ls:
            return []
        output = []
        start = None
        prev = None
        for n in ls:
            if start is None:
                start = n
                prev = n
            elif n == prev + 1:
                prev += 1
            else:
                output.append((start, prev + 1))  # exclusive
                start = n
                prev = n
        output.append((start, prev + 1))  # exclusive
        return output
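
# A minimal, illustrative check of QAMRTask's pure helpers (not part of the
# original task code; the helper name is ours). The toy TSV row and wiki_dict
# entry are invented, but their shapes follow load_tsv_dataset/load_wiki_dict
# above. Exercised from the __main__ block at the bottom of this file.
def _demo_qamr_helpers():
    import io

    # collapse_contiguous_indices groups answer word indices into
    # exclusive-end spans, per its docstring examples.
    assert QAMRTask.collapse_contiguous_indices([2, 3, 4, 5, 6, 7, 8]) == [(2, 9)]
    assert QAMRTask.collapse_contiguous_indices([1, 2, 4, 5]) == [(1, 3), (4, 6)]

    # load_tsv_dataset reads via pandas, so a file-like buffer stands in for
    # a path here; the "sent" column is filled in from wiki_dict by sent_id.
    toy_tsv = "s1\t2\tw1\t0\tchased\tWhat chased something ?\t1\t1\t1\n"
    wiki_dict = {"s1": "The cat chased the mouse ."}
    df = QAMRTask.load_tsv_dataset(io.StringIO(toy_tsv), wiki_dict=wiki_dict)
    assert df.loc[0, "sent"] == "The cat chased the mouse ."
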
class QASRLTask(SpanPredictionTask):
    def __init__(self, path, max_seq_len, name, **kw):
        """QA-SRL (Question-Answer Driven Semantic Role Labeling)
        See http://qasrl.org/

        Download, unzip, and rename the "qasrl-v2" folder to "QASRL"
        """
        super(QASRLTask, self).__init__(name, **kw)
        self.path = path
        self.max_seq_len = max_seq_len

        self.train_data_text = None
        self.val_data_text = None
        self.test_data_text = None

        self.f1_metric = F1SpanMetric()
        self.em_metric = ExactMatchSpanMetric()

        self.val_metric = "%s_avg" % self.name
        self.val_metric_decreases = False

    def count_examples(self, splits=("train", "val", "test")):
        """ No-op: example counts are set directly in load_data(). """
        pass

    def get_metrics(self, reset: bool = False) -> Dict:
        f1 = self.f1_metric.get_metric(reset)
        em = self.em_metric.get_metric(reset)
        collected_metrics = {"f1": f1, "em": em, "avg": (f1 + em) / 2}
        return collected_metrics

    def load_data(self):
        self.train_data_text = self._load_file(
            os.path.join(self.path, "orig", "train.jsonl.gz")
        )

        # Shuffle val_data to ensure diversity in periodic validation with val_data_limit
        self.val_data_text = self._load_file(
            os.path.join(self.path, "orig", "dev.jsonl.gz"), shuffle=True
        )

        self.test_data_text = self._load_file(
            os.path.join(self.path, "orig", "test.jsonl.gz")
        )

        self.sentences = (
            [example["passage"] for example in self.train_data_text]
            + [example["question"] for example in self.train_data_text]
            + [example["passage"] for example in self.val_data_text]
            + [example["question"] for example in self.val_data_text]
        )
        self.example_counts = {
            "train": len(self.train_data_text),
            "val": len(self.val_data_text),
            "test": len(self.test_data_text),
        }

    def get_sentences(self) -> Iterable[Sequence[str]]:
        """ Yield sentences, used to compute vocabulary. """
        yield from self.sentences

    def process_split(
        self, split, indexers, model_preprocessing_interface
    ) -> Iterable[Type[Instance]]:
        def _make_instance(example):
            d = dict()

            # For human-readability
            d["raw_passage"] = MetadataField(" ".join(example["passage"]))
            d["raw_question"] = MetadataField(" ".join(example["question"]))

            if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
                # Passage and question share one input sequence; the answer
                # span indexes into the passage, so it is shifted below by
                # the offset introduced by the boundary tokens.
                inp, start_offset, _ = model_preprocessing_interface.boundary_token_fn(
                    example["passage"], example["question"], get_offset=True
                )
                d["inputs"] = sentence_to_text_field(inp, indexers)
            else:
                d["passage"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(example["passage"]),
                    indexers,
                )
                d["question"] = sentence_to_text_field(
                    model_preprocessing_interface.boundary_token_fn(example["question"]),
                    indexers,
                )
                start_offset = 0
            d["span_start"] = NumericField(
                example["answer_span"][0] + start_offset,
                label_namespace="span_start_labels",
            )
            d["span_end"] = NumericField(
                example["answer_span"][1] + start_offset,
                label_namespace="span_end_labels",
            )
            d["start_offset"] = MetadataField(start_offset)
            d["passage_str"] = MetadataField(example["passage_str"])
            d["answer_str"] = MetadataField(example["answer_str"])
            d["space_processed_token_map"] = MetadataField(
                example["space_processed_token_map"]
            )
            return Instance(d)

        instances = map(_make_instance, split)
        return instances

    def _load_file(self, path, shuffle=False):
        example_list = []
        moses = MosesTokenizer()
        failed = 0
        with gzip.open(path) as f:
            lines = f.read().splitlines()

        for line in lines:
            datum = self.preprocess_qasrl_datum(json.loads(line))
            for entry in datum["entries"]:
                for question, answer_list in entry["questions"].items():
                    for answer in answer_list:
                        for answer_span in answer:
                            answer_tok_span = (
                                answer_span["span"][0],
                                answer_span["span"][1] + 1,  # exclusive
                            )
                            try:
                                remapped_result = remap_ptb_passage_and_answer_spans(
                                    ptb_tokens=datum["sentence_tokens"],
                                    answer_span=answer_tok_span,
                                    # We could move the alignment outside the loop, actually
                                    moses=moses,
                                    tokenizer_name=self.tokenizer_name,
                                )
                            except ValueError:
                                failed += 1
                                continue
                            example_list.append(
                                {
                                    "passage": self._process_sentence(
                                        remapped_result["detok_sent"]
                                    ),
                                    "question": self._process_sentence(question),
                                    "answer_span": remapped_result["answer_token_span"],
                                    "passage_str": remapped_result["detok_sent"],
                                    "answer_str": remapped_result["answer_str"],
                                    "space_processed_token_map": remapped_result[
                                        "space_processed_token_map"
                                    ],
                                }
                            )
        if failed:
            log.info("FAILED ({}): {}".format(failed, path))
        if shuffle:
            random.Random(1234).shuffle(example_list)
        return example_list

    def _process_sentence(self, sent):
        return tokenize_and_truncate(
            tokenizer_name=self.tokenizer_name, sent=sent, max_seq_len=self.max_seq_len
        )

    def get_split_text(self, split: str):
        return getattr(self, "%s_data_text" % split)

    @classmethod
    def preprocess_qasrl_datum(cls, datum):
        """ Extract relevant fields """
        return {
            "sentence_tokens": datum["sentenceTokens"],
            "entries": [
                {
                    "verb": verb_entry["verbInflectedForms"]["stem"],
                    "verb_idx": verb_idx,
                    "questions": {
                        question: [
                            [
                                {
                                    "tokens": datum["sentenceTokens"][span[0] : span[1] + 1],
                                    "span": span,
                                }
                                for span in answer_judgment["spans"]
                            ]
                            for answer_judgment in q_data["answerJudgments"]
                            if answer_judgment["isValid"]
                        ]
                        for question, q_data in verb_entry["questionLabels"].items()
                    },
                }
                for verb_idx, verb_entry in datum["verbEntries"].items()
            ],
        }
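
# Illustrative smoke test (not part of the original task code). The toy datum
# is invented, but its shape follows the QA-SRL v2 JSON schema that
# preprocess_qasrl_datum reads (sentenceTokens, verbEntries with
# verbInflectedForms/questionLabels/answerJudgments). Only isValid judgments
# are kept, and QA-SRL spans are inclusive-end until _load_file adds 1.
if __name__ == "__main__":
    _demo_qamr_helpers()

    toy_datum = {
        "sentenceTokens": ["The", "cat", "chased", "the", "mouse", "."],
        "verbEntries": {
            "2": {
                "verbInflectedForms": {"stem": "chase"},
                "questionLabels": {
                    "What chased something?": {
                        "answerJudgments": [
                            {"isValid": True, "spans": [[0, 1]]},
                            {"isValid": False, "spans": [[3, 4]]},  # dropped
                        ]
                    }
                },
            }
        },
    }
    entry = QASRLTask.preprocess_qasrl_datum(toy_datum)["entries"][0]
    assert entry["verb"] == "chase"
    # span [0, 1] is inclusive-end, so it covers "The cat"
    assert entry["questions"]["What chased something?"] == [
        [{"tokens": ["The", "cat"], "span": [0, 1]}]
    ]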