class CommitmentBankTask(Task):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.CLASSIFICATION
    LABELS = ["neutral", "entailment", "contradiction"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for line in lines:
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, line["idx"]),
                    input_premise=line["premise"],
                    input_hypothesis=line["hypothesis"],
                    label=line["label"] if set_type != "test" else cls.LABELS[-1],
                )
            )
        return examples
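# NOTE: `labels_to_bimap` is imported from elsewhere in the repo. A minimal sketch
# of its presumed behavior (hypothetical reimplementation, not the repo's code):
def _labels_to_bimap_sketch(labels):
    """Build a label <-> id bidirectional mapping from an ordered label list."""
    label_to_id = {label: i for i, label in enumerate(labels)}
    id_to_label = {i: label for i, label in enumerate(labels)}
    return label_to_id, id_to_label
# e.g. for the CB labels above this would give
# {"neutral": 0, "entailment": 1, "contradiction": 2} and the inverse mapping.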
class BoolQTask(Task):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.CLASSIFICATION
    LABELS = [False, True]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for i, line in enumerate(lines):
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    input_question=line["question"],
                    input_passage=line["passage"],
                    label=line["label"] if set_type != "test" else cls.LABELS[-1],
                )
            )
        return examples
class MutualPlusTask(mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = ["A", "B", "C", "D"]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for i, line in enumerate(lines):
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    prompt=line["article"],
                    choice_list=list(line["options"]),
                    label=line["answers"],
                )
            )
        return examples
class SentevalTenseTask(Task):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.CLASSIFICATION
    LABELS = ["PAST", "PRES"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    def get_train_examples(self):
        return self._create_examples(path=self.train_path, set_type="train")

    def get_val_examples(self):
        return self._create_examples(path=self.val_path, set_type="val")

    def get_test_examples(self):
        return self._create_examples(path=self.test_path, set_type="test")

    @classmethod
    def _create_examples(cls, path, set_type):
        examples = []
        df = pd.read_csv(path, index_col=0, names=["split", "label", "text", "unk_1", "unk_2"])
        for i, row in df.iterrows():
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    text=row.text,
                    label=row.label if set_type != "test" else cls.LABELS[-1],
                )
            )
        return examples
class CosmosQATask(mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = [0, 1, 2, 3]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    def get_train_examples(self):
        return self._create_examples(path=self.train_path, set_type="train")

    def get_val_examples(self):
        return self._create_examples(path=self.val_path, set_type="val")

    def get_test_examples(self):
        return self._create_examples(path=self.test_path, set_type="test")

    @classmethod
    def _create_examples(cls, path, set_type):
        df = pd.read_csv(path)
        examples = []
        for i, row in enumerate(df.itertuples()):
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    prompt=row.context + " " + row.question,
                    choice_list=[row.answer0, row.answer1, row.answer2, row.answer3],
                    label=row.label if set_type != "test" else cls.CHOICE_KEYS[-1],
                )
            )
        return examples
class AdversarialNliTask(Task):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.CLASSIFICATION
    LABELS = ["c", "e", "n"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for i, line in enumerate(lines):
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    input_premise=line["context"],
                    input_hypothesis=line["hypothesis"],
                    label=line["label"] if set_type != "test" else cls.LABELS[-1],
                )
            )
        return examples
class MrpcTask(GlueMixin, Task):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.CLASSIFICATION
    LABELS = ["0", "1"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    def get_train_examples(self):
        return self._create_examples(lines=read_jsonl(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_jsonl(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_jsonl(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for i, line in enumerate(lines):
            examples.append(
                Example(
                    # NOTE: get_glue_preds() is dependent on this guid format.
                    guid="%s-%s" % (set_type, i),
                    text_a=line["text_a"],
                    text_b=line["text_b"],
                    label=line["label"] if set_type != "test" else cls.LABELS[-1],
                )
            )
        return examples
class SpatialTask(Task):
    TASK_TYPE = TaskTypes.CLASSIFICATION
    LABELS = ["0", "1"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)
    Batch = Batch

    def get_train_examples(self):
        return self._create_examples(path=self.train_path, set_type="train")

    def get_val_examples(self):
        return self._create_examples(path=self.val_path, set_type="val")

    def get_test_examples(self):
        return self._create_examples(path=self.test_path, set_type="test")

    @classmethod
    def _create_examples(cls, path, set_type):
        examples = []
        df = pd.read_csv(path, index_col=0)
        for i, row in df.iterrows():
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    sentence=row.sentence,
                    label=row.label if set_type != "test" else cls.LABELS[-1],
                )
            )
        return examples
class SentEvalTopConstituentsTask(base.BaseSentEvalTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    LABELS = [
        "ADVP_NP_VP_.",
        "CC_ADVP_NP_VP_.",
        "CC_NP_VP_.",
        "IN_NP_VP_.",
        "NP_ADVP_VP_.",
        "NP_NP_VP_.",
        "NP_PP_.",
        "NP_VP_.",
        "OTHER",
        "PP_NP_VP_.",
        "RB_NP_VP_.",
        "SBAR_NP_VP_.",
        "SBAR_VP_.",
        "S_CC_S_.",
        "S_NP_VP_.",
        "S_VP_.",
        "VBD_NP_VP_.",
        "VP_.",
        "WHADVP_SQ_.",
        "WHNP_SQ_.",
    ]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)
class WinograndeTask(mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = [1, 2]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for i, line in enumerate(lines):
            choice_pre, option = line["sentence"].split("_")
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    prompt=option,
                    choice_list=[choice_pre + line["option1"], choice_pre + line["option2"]],
                    label=int(line["answer"]) if set_type != "test" else cls.CHOICE_KEYS[-1],
                )
            )
        return examples
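# Worked example for the split above (illustrative sentence, not from the dataset):
# a Winogrande sentence contains exactly one "_" placeholder, e.g.
#   "The trophy doesn't fit in the suitcase because _ is too big."
# split("_") yields choice_pre = "The trophy ... because " and option = " is too big.";
# the tail after the blank becomes the prompt, and each candidate is
# choice_pre + option1/option2.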
class QuailTask(mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = [0, 1, 2, 3]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for i, line in enumerate(lines):
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    prompt=line["context"] + " " + line["question"],
                    choice_list=list(line["answers"]),
                    label=line["correct_answer_id"],
                )
            )
        return examples
class RuHumorTask(Task):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.CLASSIFICATION
    LABELS = ["0", "1"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    def get_train_examples(self):
        return self._create_examples(lines=read_jsonl(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_jsonl(self.path_dict["validation"]), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_jsonl(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for i, line in enumerate(lines):
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    text=line["text"],
                    label=str(line["label"]) if set_type != "test" else cls.LABELS[-1],
                )
            )
        return examples
class WiCTask(SuperGlueMixin, Task):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.SPAN_COMPARISON_CLASSIFICATION
    LABELS = [False, True]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    @property
    def num_spans(self):
        return 2

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for line in lines:
            span1 = ExclusiveSpan(int(line["start1"]), int(line["end1"]))
            span2 = ExclusiveSpan(int(line["start2"]), int(line["end2"]))
            # Note: the chosen word may be different (e.g. different tenses) in
            # sent1 and sent2, hence we don't do an assert here.
            examples.append(
                Example(
                    # NOTE: WiCTask.super_glue_format_preds() is dependent on this guid format.
                    guid="%s-%s" % (set_type, line["idx"]),
                    sentence1=line["sentence1"],
                    sentence2=line["sentence2"],
                    word=line["word"],
                    span1=span1,
                    span2=span2,
                    label=line["label"] if set_type != "test" else cls.LABELS[-1],
                )
            )
        return examples

    @classmethod
    def super_glue_format_preds(cls, pred_dict):
        """Reformat this task's raw predictions to have the structure expected by SuperGLUE."""
        lines = []
        for pred, guid in zip(list(pred_dict["preds"]), list(pred_dict["guids"])):
            lines.append({"idx": int(guid.split("-")[1]), "label": str(cls.LABELS[pred]).lower()})
        return lines
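# NOTE: `ExclusiveSpan` is imported from elsewhere in the repo; it is presumably a
# simple (start, end) pair with an exclusive end index. A hypothetical sketch:
#
#   ExclusiveSpan = collections.namedtuple("ExclusiveSpan", ["start", "end"])
#
# e.g. ExclusiveSpan(0, 4) covers characters 0..3 ("word") of "word in context".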
class SemevalTask(edge_probing_two_span.AbstractProbingTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    LABELS = [
        "Cause-Effect(e1,e2)",
        "Cause-Effect(e2,e1)",
        "Component-Whole(e1,e2)",
        "Component-Whole(e2,e1)",
        "Content-Container(e1,e2)",
        "Content-Container(e2,e1)",
        "Entity-Destination(e1,e2)",
        "Entity-Destination(e2,e1)",
        "Entity-Origin(e1,e2)",
        "Entity-Origin(e2,e1)",
        "Instrument-Agency(e1,e2)",
        "Instrument-Agency(e2,e1)",
        "Member-Collection(e1,e2)",
        "Member-Collection(e2,e1)",
        "Message-Topic(e1,e2)",
        "Message-Topic(e2,e1)",
        "Other",
        "Product-Producer(e1,e2)",
        "Product-Producer(e2,e1)",
    ]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    @property
    def num_spans(self):
        return 2

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for line_num, line in enumerate(lines):
            for target_num, target in enumerate(line["targets"]):
                span1 = target["span1"]
                span2 = target["span2"]
                examples.append(
                    Example(
                        guid="%s-%s-%s" % (set_type, line_num, target_num),
                        text=line["text"],
                        span1=span1,
                        span2=span2,
                        labels=[target["label"]] if set_type != "test" else [cls.LABELS[-1]],
                    )
                )
        return examples
class NerTask(edge_probing_single_span.AbstractProbingTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    LABELS = [
        "CARDINAL",
        "DATE",
        "EVENT",
        "FAC",
        "GPE",
        "LANGUAGE",
        "LAW",
        "LOC",
        "MONEY",
        "NORP",
        "ORDINAL",
        "ORG",
        "PERCENT",
        "PERSON",
        "PRODUCT",
        "QUANTITY",
        "TIME",
        "WORK_OF_ART",
    ]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    @property
    def num_spans(self):
        return 1

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for line_num, line in enumerate(lines):
            for target_num, target in enumerate(line["targets"]):
                span = target["span1"]
                examples.append(
                    Example(
                        guid="%s-%s-%s" % (set_type, line_num, target_num),
                        text=line["text"],
                        span=span,
                        labels=[target["label"]] if set_type != "test" else [cls.LABELS[-1]],
                    )
                )
        return examples
class MultiRCTask(Task):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.CLASSIFICATION
    LABELS = [0, 1]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    def __init__(self, name, path_dict, filter_sentences=True):
        super().__init__(name=name, path_dict=path_dict)
        self.name = name
        self.path_dict = path_dict
        self.filter_sentences = filter_sentences

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    def _create_examples(self, lines, set_type):
        examples = []
        question_id = 0
        for line in lines:
            soup = bs4.BeautifulSoup(line["passage"]["text"], features="lxml")
            sentence_ls = []
            for elem in soup.html.body.contents:
                if isinstance(elem, bs4.element.NavigableString):
                    sentence_ls.append(str(elem).strip())
            for question_dict in line["passage"]["questions"]:
                question = question_dict["question"]
                if self.filter_sentences:
                    paragraph = " ".join(
                        sentence
                        for i, sentence in enumerate(sentence_ls, start=1)
                        if i in question_dict["sentences_used"]
                    )
                else:
                    paragraph = " ".join(sentence_ls)
                for answer_dict in question_dict["answers"]:
                    answer = answer_dict["text"]
                    examples.append(
                        Example(
                            guid="%s-%s" % (set_type, line["idx"]),
                            paragraph=paragraph,
                            question=question,
                            answer=answer,
                            label=answer_dict["label"] if set_type != "test" else self.LABELS[-1],
                            question_id=question_id,
                        )
                    )
                question_id += 1
        return examples
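# Worked example for the sentence extraction above (illustrative markup, assuming
# the MultiRC passages interleave sentence text with HTML tags):
#   "<b>Sent 1: </b>First sentence.<br><b>Sent 2: </b>Second sentence.<br>"
# BeautifulSoup parses the <b>/<br> tags into Tag elements, so only the bare text
# nodes ("First sentence.", "Second sentence.") are NavigableStrings and land in
# sentence_ls.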
class SentEvalObjNumberTask(base.BaseSentEvalTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    LABELS = ["NN", "NNS"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)


class SentEvalPastPresentTask(base.BaseSentEvalTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    LABELS = ["PAST", "PRES"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)


class SentEvalBigramShiftTask(base.BaseSentEvalTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    LABELS = ["I", "O"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)


class SentEvalSentenceLengthTask(base.BaseSentEvalTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    LABELS = [0, 1, 2, 3, 4, 5, 6]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)


class SentEvalCoordinationInversionTask(base.BaseSentEvalTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    LABELS = ["I", "O"]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)


class SentEvalTreeDepthTask(base.BaseSentEvalTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    LABELS = [5, 6, 7, 8, 9, 10, 11]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)
class ReCoRDTask(Task):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.CLASSIFICATION
    LABELS = [False, True]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for line in lines:
            passage_text = line["passage"]["text"]
            for qas in line["qas"]:
                answers_dict = {}
                if set_type != "test":
                    answers_dict = {
                        (answer["start"], answer["end"]): answer["text"]
                        for answer in qas["answers"]
                    }
                for entity in line["passage"]["entities"]:
                    label = False
                    entity_span = (entity["start"], entity["end"])
                    if set_type != "test":
                        if entity_span in answers_dict:
                            assert (
                                passage_text[entity_span[0]:entity_span[1] + 1]
                                == answers_dict[entity_span]
                            )
                            label = True
                    examples.append(
                        Example(
                            guid="%s-%s" % (set_type, len(examples)),
                            passage_text=passage_text,
                            query_text=qas["query"],
                            entity_start_char_idx=entity_span[0],
                            entity_end_char_idx=entity_span[1] + 1,  # make exclusive
                            entity_str=passage_text[entity_span[0]:entity_span[1] + 1],
                            passage_idx=line["idx"],
                            question_idx=qas["idx"],
                            answers_dict=answers_dict,
                            label=label,
                        )
                    )
        return examples
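# ReCoRD entity spans use inclusive character indices, hence the "+ 1" above:
# e.g. with passage_text = "Paris is nice" and entity {"start": 0, "end": 4},
# passage_text[0:4 + 1] == "Paris", stored with an exclusive end index of 5.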
class WSCTask(SuperGlueMixin, Task):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    TASK_TYPE = TaskTypes.SPAN_COMPARISON_CLASSIFICATION
    LABELS = [False, True]
    LABEL_TO_ID, ID_TO_LABEL = labels_to_bimap(LABELS)

    @property
    def num_spans(self):
        return 2

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for line in lines:
            # NOTE: "span1_index", "span2_index", "span1_text", and "span2_text"
            # are nested under line["target"], not at the top level.
            assert "target" in line
            examples.append(
                Example(
                    # NOTE: WSCTask.super_glue_format_preds() is dependent on this guid format.
                    guid="%s-%s" % (set_type, line["idx"]),
                    text=line["text"],
                    span1_idx=line["target"]["span1_index"],
                    span2_idx=line["target"]["span2_index"],
                    span1_text=line["target"]["span1_text"],
                    span2_text=line["target"]["span2_text"],
                    label=line["label"] if set_type != "test" else cls.LABELS[-1],
                )
            )
        return examples

    @classmethod
    def super_glue_format_preds(cls, pred_dict):
        """Reformat this task's raw predictions to have the structure expected by SuperGLUE."""
        lines = []
        for pred, guid in zip(list(pred_dict["preds"]), list(pred_dict["guids"])):
            lines.append({"idx": int(guid.split("-")[1]), "label": str(cls.LABELS[pred])})
        return lines
class ArctTask(mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = [0, 1]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    def get_train_examples(self):
        return self._create_examples(self.train_path, set_type="train")

    def get_val_examples(self):
        return self._create_examples(self.val_path, set_type="val")

    def get_test_examples(self):
        return self._create_examples(self.test_path, set_type="test")

    @classmethod
    def _create_examples(cls, path, set_type):
        df_names = [
            "#id",
            "warrant0",
            "warrant1",
            "gold_label",
            "reason",
            "claim",
            "debateTitle",
            "debateInfo",
        ]
        df = pd.read_csv(path, sep="\t", header=0, names=df_names)
        choice_pre = "And since "
        examples = []
        for i, row in enumerate(df.itertuples()):
            # Prompt/choice format follows the task repo:
            # https://github.com/UKPLab/argument-reasoning-comprehension-task
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    prompt=row.reason + " ",
                    choice_list=[
                        choice_pre + row.warrant0 + ", " + row.claim,
                        choice_pre + row.warrant1 + ", " + row.claim,
                    ],
                    label=row.gold_label if set_type != "test" else cls.CHOICE_KEYS[-1],
                )
            )
        return examples
class CopaTask(SuperGlueMixin, mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = [0, 1]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    _QUESTION_DICT = {
        "cause": "What was the cause of this?",
        "effect": "What happened as a result?",
    }

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for line in lines:
            question = cls._QUESTION_DICT[line["question"]]
            examples.append(
                Example(
                    # NOTE: CopaTask.super_glue_format_preds() is dependent on this guid format.
                    guid="%s-%s" % (set_type, line["idx"]),
                    prompt=line["premise"] + " " + question,
                    choice_list=[line["choice1"], line["choice2"]],
                    label=line["label"] if set_type != "test" else cls.CHOICE_KEYS[-1],
                )
            )
        return examples

    @classmethod
    def super_glue_format_preds(cls, pred_dict):
        """Reformat this task's raw predictions to have the structure expected by SuperGLUE."""
        lines = []
        for pred, guid in zip(list(pred_dict["preds"]), list(pred_dict["guids"])):
            lines.append({"idx": int(guid.split("-")[1]), "label": cls.CHOICE_KEYS[pred]})
        return lines
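# Usage sketch for super_glue_format_preds above (hypothetical prediction dict):
#   pred_dict = {"preds": [0, 1], "guids": ["val-0", "val-1"]}
#   CopaTask.super_glue_format_preds(pred_dict)
#   # -> [{"idx": 0, "label": 0}, {"idx": 1, "label": 1}]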
class SocialIQATask(mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = ["A", "B", "C"]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        answer_key_ls = ["answerA", "answerB", "answerC"]
        hf_datasets_label_map = {
            "1\n": "A",
            "2\n": "B",
            "3\n": "C",
        }
        for i, line in enumerate(lines):
            if "label" in line:
                # Loading from HF Datasets data
                label = hf_datasets_label_map[line["label"]]
            else:
                # Loading from original data
                label = line["correct"]
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    prompt=line["context"] + " " + line["question"],
                    choice_list=[line[answer_key] for answer_key in answer_key_ls],
                    label=label,
                )
            )
        return examples

    @classmethod
    def _read_labels(cls, path):
        lines = read_file_lines(path)
        return [int(line.strip()) for line in lines]
class MCTestTask(mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = TokenizedExample
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = ["A", "B", "C", "D"]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    def get_train_examples(self):
        return self._create_examples(
            lines=read_file_lines(self.train_path, strip_lines=True),
            ans_lines=read_file_lines(self.path_dict["train_ans"], strip_lines=True),
            set_type="train",
        )

    def get_val_examples(self):
        return self._create_examples(
            lines=read_file_lines(self.val_path, strip_lines=True),
            ans_lines=read_file_lines(self.path_dict["val_ans"], strip_lines=True),
            set_type="val",
        )

    def get_test_examples(self):
        return self._create_examples(
            lines=read_file_lines(self.test_path, strip_lines=True),
            ans_lines=None,
            set_type="test",
        )

    @classmethod
    def _create_examples(cls, lines, ans_lines, set_type):
        examples = []
        if ans_lines is None:
            # No test answers are available; substitute dummy labels (the last choice key).
            ans_lines = ["\t".join([cls.CHOICE_KEYS[-1]] * 4) for _ in lines]
        for i, (line, ans) in enumerate(zip(lines, ans_lines)):
            line = line.split("\t")
            ans = ans.split("\t")
            for j in range(4):
                examples.append(
                    Example(
                        guid="%s-%s" % (set_type, i * 4 + j),
                        prompt=line[2].replace("\\newline", " ") + " " + line[3 + j * 5],
                        choice_list=line[4 + j * 5:8 + j * 5],
                        label=ans[j],
                    )
                )
        return examples
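# MCTest .tsv layout implied by the index arithmetic above: column 2 holds the
# story, and each of the four questions takes five columns:
#   question j: line[3 + j*5]           -> question text
#               line[4 + j*5 : 8 + j*5] -> its four answer choices
# so for j = 1, line[8] is the question and line[9:13] are its choices.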
class ArcChallengeTask(mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = ["A", "B", "C", "D", "E"]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    def get_train_examples(self):
        return self._create_examples(lines=read_json_lines(self.train_path), set_type="train")

    def get_val_examples(self):
        return self._create_examples(lines=read_json_lines(self.val_path), set_type="val")

    def get_test_examples(self):
        return self._create_examples(lines=read_json_lines(self.test_path), set_type="test")

    @classmethod
    def _create_examples(cls, lines, set_type):
        # Some items use numeric answer keys ("1".."5"); remap them to letters.
        potential_label_map = {
            "1": "A",
            "2": "B",
            "3": "C",
            "4": "D",
            "5": "E",
        }
        num_choices = len(potential_label_map)
        examples = []
        for i, line in enumerate(lines):
            label = line["answerKey"]
            if label in potential_label_map:
                label = potential_label_map[label]
            choice_list = list(line["choices"]["text"])
            # Some items have fewer than five options; pad with "." fillers so
            # every example has exactly num_choices choices.
            filler_choice_list = ["." for _ in range(num_choices - len(choice_list))]
            choice_list = choice_list + filler_choice_list
            assert len(choice_list) == num_choices
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    prompt=line["question"],
                    choice_list=choice_list,
                    label=label,
                )
            )
        return examples
class PiqaTask(mc_template.AbstractMultipleChoiceTask):
    Example = Example
    TokenizedExample = Example
    DataRow = DataRow
    Batch = Batch

    CHOICE_KEYS = [0, 1]
    CHOICE_TO_ID, ID_TO_CHOICE = labels_to_bimap(CHOICE_KEYS)
    NUM_CHOICES = len(CHOICE_KEYS)

    def get_train_examples(self):
        return self._create_examples(
            lines=zip(
                read_json_lines(self.train_path),
                read_file_lines(self.path_dict["train_labels"], strip_lines=True),
            ),
            set_type="train",
        )

    def get_val_examples(self):
        return self._create_examples(
            lines=zip(
                read_json_lines(self.val_path),
                read_file_lines(self.path_dict["val_labels"], strip_lines=True),
            ),
            set_type="val",
        )

    def get_test_examples(self):
        # No test labels are available: read the examples file twice so the zipped
        # pairs have the right shape; the dummy label column is ignored for "test".
        return self._create_examples(
            lines=zip(read_json_lines(self.test_path), read_json_lines(self.test_path)),
            set_type="test",
        )

    @classmethod
    def _create_examples(cls, lines, set_type):
        examples = []
        for i, (ex, label_string) in enumerate(lines):
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, i),
                    prompt=ex["goal"],
                    choice_list=[ex["sol1"], ex["sol2"]],
                    label=int(label_string) if set_type != "test" else cls.CHOICE_KEYS[-1],
                )
            )
        return examples