def download_xnli_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    """Download XNLI, split the dev/test files by language, and write per-language data + configs.

    Each language found in the dev set becomes a task ``xnli_<lang>`` with its own
    ``val.jsonl``/``test.jsonl`` under ``task_data_base_path`` and a JSON config under
    ``task_config_base_path``. The temporary download directory is removed afterwards.
    """
    xnli_temp_path = py_io.create_dir(task_data_base_path, "xnli_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip",
        xnli_temp_path,
    )
    # XNLI ships one combined file per split; bucket the rows per language.
    val_data = datastructures.group_by(
        py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.dev.jsonl")),
        key_func=lambda elem: elem["language"],
    )
    test_data = datastructures.group_by(
        py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.test.jsonl")),
        lambda elem: elem["language"],
    )
    for lang in sorted(val_data):
        task_name = f"xnli_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, "val.jsonl")
        test_path = os.path.join(task_data_path, "test.jsonl")
        py_io.write_jsonl(data=val_data[lang], path=val_path)
        py_io.write_jsonl(data=test_data[lang], path=test_path)
        py_io.write_json(
            data={
                "task": "xnli",
                "paths": {"val": val_path, "test": test_path},
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(xnli_temp_path)
def download_fever_nli_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    """Download the FEVER NLI data, recover dev labels from original FEVER, and write a config.

    The FEVER NLI dev split ships without labels, so the original FEVER shared-task
    dev file is downloaded as well and labels are attached by matching example CIDs.
    Train/test splits are renamed into place and a task config JSON is written to
    ``task_config_path``.
    """
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        ("https://www.dropbox.com/s/hylbuaovqwo2zav/nli_fever.zip?dl=1"),
        task_data_path,
    )
    orig_dev_path = os.path.join(task_data_path, "fever-dev-temp.jsonl")
    download_utils.download_file(
        "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl",
        orig_dev_path,
    )
    # Build id -> label from the original FEVER dev file, skipping malformed rows.
    id_to_label = {}
    for row in py_io.read_jsonl(orig_dev_path):
        if "id" not in row:
            logging.warning("FEVER dev dataset is missing ID.")
        elif "label" not in row:
            logging.warning("FEVER dev dataset is missing label.")
        else:
            id_to_label[row["id"]] = row["label"]
    os.remove(orig_dev_path)
    # Attach labels to the FEVER NLI dev examples by CID and write them as val.jsonl.
    dev_path = os.path.join(task_data_path, "nli_fever", "dev_fitems.jsonl")
    dev_examples = []
    for row in py_io.read_jsonl(dev_path):
        if "cid" not in row:
            logging.warning("Data in {} is missing CID.".format(dev_path))
            continue
        cid = int(row["cid"])
        if cid not in id_to_label:
            logging.warning("Could not match CID {} to dev data.".format(row["cid"]))
            continue
        row["label"] = id_to_label[cid]
        dev_examples.append(row)
    py_io.write_jsonl(dev_examples, os.path.join(task_data_path, "val.jsonl"))
    os.remove(dev_path)
    # Move the remaining splits to their conventional names and drop the temp dir.
    for phase in ["train", "test"]:
        os.rename(
            os.path.join(task_data_path, "nli_fever", f"{phase}_fitems.jsonl"),
            os.path.join(task_data_path, f"{phase}.jsonl"),
        )
    shutil.rmtree(os.path.join(task_data_path, "nli_fever"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "val.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def load_log(fol_path):
    """Read every ``.zlog`` file under ``fol_path``.

    Returns a dict mapping each file's path relative to ``fol_path`` (with the
    ``.zlog`` extension stripped) to its parsed jsonl contents.
    """
    abs_root = os.path.abspath(fol_path)
    log_data = {}
    for path in filesystem.find_files_with_ext(fol_path, "zlog"):
        # Strip the root prefix plus the leading separator, then the extension.
        key = os.path.abspath(path).replace(abs_root, "")[1:].replace(".zlog", "")
        log_data[key] = py_io.read_jsonl(path)
    return log_data
def read_examples(cls, path: str, set_type: str):
    """Parse an MRQA-style QA file (``.jsonl`` or gzipped) into Example objects.

    The first record is a header and is skipped. For training, only the first
    detected answer's first character span is used; for evaluation, every span
    of every detected answer is recorded in ``answers``.

    Raises:
        KeyError: if ``path`` has neither a ``.gz`` nor a ``.jsonl`` extension.
    """
    if path.endswith(".gz"):
        with gzip.open(path, "r") as f:
            data = [json.loads(row) for row in f]
    elif path.endswith(".jsonl"):
        data = read_jsonl(path)
    else:
        raise KeyError(f"Unknown format: {path}")
    # First row is a header row
    assert "header" in data[0]
    examples = []
    for i, entry in enumerate(data[1:]):
        for qa in entry["qas"]:
            if set_type == PHASE.TRAIN:
                # Just use first answer for training (e.g. "Car", "Vehicle"),
                # and its first occurrence ("Every car is a car.").
                span = qa["detected_answers"][0]["char_spans"][0]
                examples.append(
                    Example(
                        qas_id=f"{set_type}-{i}",
                        question_text=qa["question"],
                        context_text=entry["context"],
                        answer_text=entry["context"][span[0]:span[1] + 1],
                        start_position_character=span[0],
                        title="",
                        is_impossible=False,
                        answers=[],
                    )
                )
            else:
                # Evaluation keeps every span of every detected answer.
                answers = [
                    {
                        "answer_start": span[0],
                        "text": entry["context"][span[0]:span[1] + 1],
                    }
                    for answer in qa["detected_answers"]
                    for span in answer["char_spans"]
                ]
                examples.append(
                    Example(
                        qas_id=f"{set_type}-{i}",
                        question_text=qa["question"],
                        context_text=entry["context"],
                        answer_text=None,
                        start_position_character=None,
                        title="",
                        is_impossible=False,
                        answers=answers,
                    )
                )
    return examples
def _create_examples(cls, path, set_type):
    """Yield one Example per jsonl row, converting span pairs to ExclusiveSpan objects."""
    for idx, row in enumerate(py_io.read_jsonl(path)):
        spans = [ExclusiveSpan(start, end) for start, end in row["masked_spans"]]
        yield Example(
            guid="%s-%s" % (set_type, idx),
            tokenized_text=row["tokenized_text"],
            masked_spans=spans,
        )
def default_get_test_labels(task: Task, possible_labels):
    """Collect the labels of the test set.

    Whether this default works depends on the subclass implementation: most
    subclasses share the same (implicitly agreed, not declared on the base
    class) ``_create_examples`` interface, so this implementation succeeds
    for them. If any collected label is outside ``possible_labels`` (i.e.
    the test labels are masked), every label is replaced by the last
    possible label as a placeholder.
    """
    examples = task._create_examples(read_jsonl(task.test_path), "valid")
    labels = [example.label for example in examples]
    if any(label not in possible_labels for label in labels):
        # masked labels
        labels = [possible_labels[-1]] * len(labels)
    return labels
def read_examples(cls, path: str, set_type: str):
    """Build QA Examples from jsonl entries.

    Each entry is assumed to hold a ``text`` context and ``qas`` items with a
    ``question`` and an answer span ``{"s": start, "e": end}`` — TODO confirm
    against the data files this reader is used with.
    """
    examples = []
    for entry in read_jsonl(path):
        context = entry["text"]
        for qa in entry["qas"]:
            start = qa["answer"]["s"]
            end = qa["answer"]["e"]
            answer_text = context[start:end]
            examples.append(
                Example(
                    qas_id=f"{set_type}-{len(examples)}",
                    question_text=qa["question"],
                    context_text=context,
                    answer_text=answer_text,
                    start_position_character=start,
                    title="",
                    is_impossible=False,
                    answers=[{"answer_start": start, "text": answer_text}],
                )
            )
    return examples
def read_examples(cls, path: str, set_type: str):
    """Read SQuAD-format jsonl rows into Example objects.

    For training, each (answer_start, answer_text) pair in a row yields its own
    positive Example (all sharing the row's qas_id ``{set_type}-{i}``); for
    evaluation, a single Example per row carries all gold answers.

    Fix: dropped the unused ``enumerate`` index ``j`` around the train-branch
    zip — it was never read.
    """
    examples = []
    for i, line in enumerate(read_jsonl(path)):
        if set_type == PHASE.TRAIN:
            # One training example per annotated answer occurrence.
            for answer_start, answer_text in zip(
                    line["answers"]["answer_start"], line["answers"]["text"]):
                examples.append(
                    Example(
                        qas_id=f"{set_type}-{i}",
                        question_text=line["question"],
                        context_text=line["context"],
                        answer_text=answer_text,
                        start_position_character=answer_start,
                        title=line["title"],
                        is_impossible=False,
                        answers=[],
                    ))
        else:
            # Evaluation: keep all gold answers on one example for scoring.
            answers = [{
                "answer_start": answer_start,
                "text": answer_text
            } for answer_start, answer_text in zip(
                line["answers"]["answer_start"], line["answers"]["text"])]
            examples.append(
                Example(
                    qas_id=f"{set_type}-{i}",
                    question_text=line["question"],
                    context_text=line["context"],
                    answer_text=None,
                    start_position_character=None,
                    title=line["title"],
                    is_impossible=False,
                    answers=answers,
                ))
    return examples
def get_test_examples(self):
    """Create test-split examples from ``self.test_path`` for this task's language."""
    test_lines = read_jsonl(self.test_path)
    return self._create_examples(
        lines=test_lines, set_type="test", language=self.language
    )
def get_val_examples(self):
    """Create validation-split examples from ``self.val_path`` for this task's language."""
    val_lines = read_jsonl(self.val_path)
    return self._create_examples(
        lines=val_lines, set_type="val", language=self.language
    )
def get_val_examples(self):
    """Create validation-split examples from the configured validation path."""
    val_lines = read_jsonl(self.path_dict["validation"])
    return self._create_examples(lines=val_lines, set_type="val")
def get_train_examples(self):
    """Create training-split examples from ``self.train_path``."""
    train_lines = read_jsonl(self.train_path)
    return self._create_examples(lines=train_lines, set_type="train")