import itertools
import logging
import os
import shutil

# NOTE: the aliases below (py_io, download_utils, filesystem, datastructures)
# are used throughout this module; the full module paths follow jiant's layout.
import jiant.scripts.download_data.utils as download_utils
import jiant.utils.python.datastructures as datastructures
import jiant.utils.python.filesystem as filesystem
import jiant.utils.python.io as py_io

logger = logging.getLogger(__name__)


def download_winogrande_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://storage.googleapis.com/ai2-mosaic/public/winogrande/winogrande_1.1.zip",
        task_data_path,
    )
    task_data_path = os.path.join(task_data_path, "winogrande_1.1")
    py_io.write_json(
        data={
            "task": task_name,
            # Note: the default "train" split points at the XL training set.
            "paths": {
                "train": os.path.join(task_data_path, "train_xl.jsonl"),
                "train_labels": os.path.join(task_data_path, "train_xl-labels.lst"),
                "train_xs": os.path.join(task_data_path, "train_xs.jsonl"),
                "train_xs_labels": os.path.join(task_data_path, "train_xs-labels.lst"),
                "train_s": os.path.join(task_data_path, "train_s.jsonl"),
                "train_s_labels": os.path.join(task_data_path, "train_s-labels.lst"),
                "train_m": os.path.join(task_data_path, "train_m.jsonl"),
                "train_m_labels": os.path.join(task_data_path, "train_m-labels.lst"),
                "train_l": os.path.join(task_data_path, "train_l.jsonl"),
                "train_l_labels": os.path.join(task_data_path, "train_l-labels.lst"),
                "train_xl": os.path.join(task_data_path, "train_xl.jsonl"),
                "train_xl_labels": os.path.join(task_data_path, "train_xl-labels.lst"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "val_labels": os.path.join(task_data_path, "dev-labels.lst"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
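# Illustrative call for the function above. The directory layout here is a
# hypothetical example, not something this module prescribes:
#
#   download_winogrande_data_and_write_config(
#       task_name="winogrande",
#       task_data_path="/tasks/data/winogrande",
#       task_config_path="/tasks/configs/winogrande_config.json",
#   )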
def download_swag_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://github.com/rowanz/swagaf/archive/master.zip", task_data_path,
    )
    for phase in ["train", "val", "test"]:
        os.rename(
            os.path.join(task_data_path, "swagaf-master", "data", f"{phase}.csv"),
            os.path.join(task_data_path, f"{phase}.csv"),
        )
    shutil.rmtree(os.path.join(task_data_path, "swagaf-master"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.csv"),
                "val": os.path.join(task_data_path, "val.csv"),
                "test": os.path.join(task_data_path, "test.csv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def download_qamr_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://github.com/uwnlp/qamr/archive/master.zip", task_data_path,
    )
    # QAMR names its validation split "dev"; jiant expects "val".
    data_phase_list = ["train", "dev", "test"]
    jiant_phase_list = ["train", "val", "test"]
    for data_phase, jiant_phase in zip(data_phase_list, jiant_phase_list):
        os.rename(
            os.path.join(task_data_path, "qamr-master", "data", "filtered", f"{data_phase}.tsv"),
            os.path.join(task_data_path, f"{jiant_phase}.tsv"),
        )
    os.rename(
        os.path.join(task_data_path, "qamr-master", "data", "wiki-sentences.tsv"),
        os.path.join(task_data_path, "wiki-sentences.tsv"),
    )
    shutil.rmtree(os.path.join(task_data_path, "qamr-master"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.tsv"),
                "val": os.path.join(task_data_path, "val.tsv"),
                "test": os.path.join(task_data_path, "test.tsv"),
                "wiki_dict": os.path.join(task_data_path, "wiki-sentences.tsv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def download_mlqa_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    mlqa_temp_path = py_io.create_dir(task_data_base_path, "mlqa_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip", mlqa_temp_path,
    )
    languages = "ar de en es hi vi zh".split()
    # One task per (context language, question language) pair. MLQA ships only
    # dev and test splits, so no train path is written.
    for lang1, lang2 in itertools.product(languages, languages):
        task_name = f"mlqa_{lang1}_{lang2}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, f"dev-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "dev", f"dev-context-{lang1}-question-{lang2}.json"
            ),
            dst=val_path,
        )
        test_path = os.path.join(task_data_path, f"test-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "test", f"test-context-{lang1}-question-{lang2}.json"
            ),
            dst=test_path,
        )
        py_io.write_json(
            data={
                "task": "mlqa",
                "paths": {"val": val_path, "test": test_path},
                "kwargs": {"context_language": lang1, "question_language": lang2},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(mlqa_temp_path)
def download_xnli_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    xnli_temp_path = py_io.create_dir(task_data_base_path, "xnli_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip", xnli_temp_path,
    )
    # XNLI ships a single dev file and a single test file covering all
    # languages, so split each by the "language" field.
    full_val_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.dev.jsonl"))
    val_data = datastructures.group_by(full_val_data, key_func=lambda elem: elem["language"])
    full_test_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.test.jsonl"))
    test_data = datastructures.group_by(full_test_data, key_func=lambda elem: elem["language"])
    languages = sorted(val_data)
    for lang in languages:
        task_name = f"xnli_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, "val.jsonl")
        test_path = os.path.join(task_data_path, "test.jsonl")
        py_io.write_jsonl(data=val_data[lang], path=val_path)
        py_io.write_jsonl(data=test_data[lang], path=test_path)
        py_io.write_json(
            data={
                "task": "xnli",
                "paths": {"val": val_path, "test": test_path},
                "kwargs": {"language": lang},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(xnli_temp_path)
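# Note: `datastructures.group_by` above is assumed to behave like this minimal
# sketch, returning a dict mapping each key to its elements in input order:
#
#   def group_by(iterable, key_func):
#       groups = {}
#       for elem in iterable:
#           groups.setdefault(key_func(elem), []).append(elem)
#       return groups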
def download_fever_nli_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://www.dropbox.com/s/hylbuaovqwo2zav/nli_fever.zip?dl=1", task_data_path,
    )
    # Since the FEVER NLI dataset doesn't have labels for the dev set, we also
    # download the original FEVER dev set and match example CIDs to obtain labels.
    orig_dev_path = os.path.join(task_data_path, "fever-dev-temp.jsonl")
    download_utils.download_file(
        "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl", orig_dev_path,
    )
    id_to_label = {}
    for line in py_io.read_jsonl(orig_dev_path):
        if "id" not in line:
            logging.warning("FEVER dev dataset is missing ID.")
            continue
        if "label" not in line:
            logging.warning("FEVER dev dataset is missing label.")
            continue
        id_to_label[line["id"]] = line["label"]
    os.remove(orig_dev_path)

    dev_path = os.path.join(task_data_path, "nli_fever", "dev_fitems.jsonl")
    dev_examples = []
    for line in py_io.read_jsonl(dev_path):
        if "cid" not in line:
            logging.warning("Data in {} is missing CID.".format(dev_path))
            continue
        if int(line["cid"]) not in id_to_label:
            logging.warning("Could not match CID {} to dev data.".format(line["cid"]))
            continue
        dev_example = line
        dev_example["label"] = id_to_label[int(line["cid"])]
        dev_examples.append(dev_example)
    py_io.write_jsonl(dev_examples, os.path.join(task_data_path, "val.jsonl"))
    os.remove(dev_path)

    for phase in ["train", "test"]:
        os.rename(
            os.path.join(task_data_path, "nli_fever", f"{phase}_fitems.jsonl"),
            os.path.join(task_data_path, f"{phase}.jsonl"),
        )
    shutil.rmtree(os.path.join(task_data_path, "nli_fever"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "val.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def download_superglue_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    # Applies to ReCoRD, MultiRC and WSC
    assert task_name not in SQUAD_TASKS

    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        DIRECT_SUPERGLUE_TASKS_TO_DATA_URLS[task_name], task_data_path
    )

    # Move task data up one folder (nested under task name when unzipped)
    # i.e.: mv ./record/ReCoRD/* ./record
    nested_task_dir = os.path.join(
        task_data_path, filesystem.find_case_insensitive_filename(task_name, task_data_path)
    )
    task_data_files = os.listdir(nested_task_dir)
    for f in task_data_files:
        # Overwrite file if it exists (overwrite by full path specification)
        shutil.move(os.path.join(nested_task_dir, f), os.path.join(task_data_path, f))
    shutil.rmtree(nested_task_dir)

    # Supports datasets with non-standard dev dataset name
    if os.path.isfile(os.path.join(task_data_path, "dev.jsonl")):
        dev_data_name = "dev.jsonl"
    elif os.path.isfile(os.path.join(task_data_path, "val.jsonl")):
        dev_data_name = "val.jsonl"
    else:
        raise RuntimeError("Unsupported dev dataset name in downloaded task.")

    val_path = os.path.join(task_data_path, dev_data_name)
    train_path = os.path.join(task_data_path, "train.jsonl")
    test_path = os.path.join(task_data_path, "test.jsonl")
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {"train": train_path, "val": val_path, "test": test_path},
            "name": task_name,
        },
        path=task_config_path,
    )
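# The two module-level constants referenced above are assumed to be defined
# elsewhere in this file, roughly along these lines (URLs elided, and the
# exact keys/members are an assumption based on the comment in the function):
#
#   SQUAD_TASKS = {"squad_v1", "squad_v2"}
#   DIRECT_SUPERGLUE_TASKS_TO_DATA_URLS = {
#       "record": "...",   # ReCoRD
#       "multirc": "...",  # MultiRC
#       "wsc": "...",      # WSC
#   }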
def download_mctest500_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://mattr1.github.io/mctest/data/MCTest.zip", task_data_path,
    )
    download_utils.download_and_unzip(
        "https://mattr1.github.io/mctest/data/MCTestAnswers.zip", task_data_path,
    )
    # Test-set answers ship in a separate archive; move them next to the rest
    # of the MC500 files before flattening the directory.
    os.rename(
        os.path.join(task_data_path, "MCTestAnswers", "mc500.test.ans"),
        os.path.join(task_data_path, "MCTest", "mc500.test.ans"),
    )
    shutil.rmtree(os.path.join(task_data_path, "MCTestAnswers"))
    for phase in ["train", "dev", "test"]:
        os.rename(
            os.path.join(task_data_path, "MCTest", f"mc500.{phase}.tsv"),
            os.path.join(task_data_path, f"mc500.{phase}.tsv"),
        )
        os.rename(
            os.path.join(task_data_path, "MCTest", f"mc500.{phase}.ans"),
            os.path.join(task_data_path, f"mc500.{phase}.ans"),
        )
    shutil.rmtree(os.path.join(task_data_path, "MCTest"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "mc500.train.tsv"),
                "train_ans": os.path.join(task_data_path, "mc500.train.ans"),
                "val": os.path.join(task_data_path, "mc500.dev.tsv"),
                "val_ans": os.path.join(task_data_path, "mc500.dev.ans"),
                "test": os.path.join(task_data_path, "mc500.test.tsv"),
                "test_ans": os.path.join(task_data_path, "mc500.test.ans"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def download_abductive_nli_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://storage.googleapis.com/ai2-mosaic/public/alphanli/alphanli-train-dev.zip",
        task_data_path,
    )
    # The alphanli archive contains only train and dev splits, so no test
    # paths are written.
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train_inputs": os.path.join(task_data_path, "train.jsonl"),
                "train_labels": os.path.join(task_data_path, "train-labels.lst"),
                "val_inputs": os.path.join(task_data_path, "dev.jsonl"),
                "val_labels": os.path.join(task_data_path, "dev-labels.lst"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def download_tatoeba_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    tatoeba_temp_path = py_io.create_dir(task_data_base_path, "tatoeba_temp")
    download_utils.download_and_unzip(
        "https://github.com/facebookresearch/LASER/archive/master.zip", tatoeba_temp_path,
    )
    # Maps the three-letter language codes used in the raw Tatoeba file names
    # to the two-letter codes used in jiant task names.
    languages_dict = {
        "afr": "af", "ara": "ar", "bul": "bg", "ben": "bn", "deu": "de", "ell": "el",
        "spa": "es", "est": "et", "eus": "eu", "pes": "fa", "fin": "fi", "fra": "fr",
        "heb": "he", "hin": "hi", "hun": "hu", "ind": "id", "ita": "it", "jpn": "ja",
        "jav": "jv", "kat": "ka", "kaz": "kk", "kor": "ko", "mal": "ml", "mar": "mr",
        "nld": "nl", "por": "pt", "rus": "ru", "swh": "sw", "tam": "ta", "tel": "te",
        "tha": "th", "tgl": "tl", "tur": "tr", "urd": "ur", "vie": "vi", "cmn": "zh",
        "eng": "en",
    }
    raw_base_path = os.path.join(tatoeba_temp_path, "LASER-master", "data", "tatoeba", "v1")
    for full_lang, lang in languages_dict.items():
        if lang == "en":
            continue
        task_name = f"tatoeba_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        eng_src = os.path.join(raw_base_path, f"tatoeba.{full_lang}-eng.eng")
        other_src = os.path.join(raw_base_path, f"tatoeba.{full_lang}-eng.{full_lang}")
        eng_out = os.path.join(task_data_path, f"{lang}-en.en")
        other_out = os.path.join(task_data_path, f"{lang}-en.{lang}")
        labels_out = os.path.join(task_data_path, f"{lang}-en.labels")
        tgts = [line.strip() for line in py_io.read_file_lines(eng_src)]
        os.rename(src=other_src, dst=other_out)
        idx = range(len(tgts))
        data = zip(tgts, idx)
        # Tatoeba is a retrieval dataset where you have a set of sentences in English and
        # another set in another language, and you need to match them. It also doesn't have
        # training data, so it's pretty much evaluation only. However, the dataset is
        # distributed with the sentences in order, i.e. the retrieval pairing is the
        # sentence order.
        #
        # The XTREME authors intentionally scramble the order by sorting one of the two
        # sets alphabetically. We're following their recipe, but also retaining the labels
        # for internal scoring.
        with py_io.get_lock(eng_out), py_io.get_lock(labels_out):
            if os.path.exists(eng_out) and os.path.exists(labels_out):
                logger.info("Skip writing to %s since it already exists.", eng_out)
                logger.info("Skip writing to %s since it already exists.", labels_out)
            else:
                with open(eng_out, "w") as ftgt, open(labels_out, "w") as flabels:
                    for t, i in sorted(data, key=lambda x: x[0]):
                        ftgt.write(f"{t}\n")
                        flabels.write(f"{i}\n")
        py_io.write_json(
            data={
                "task": "tatoeba",
                "paths": {"eng": eng_out, "other": other_out, "labels_path": labels_out},
                "kwargs": {"language": lang},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(tatoeba_temp_path)
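# A minimal usage sketch (hypothetical; not part of the original module). Many
# of the downloaders above share the (task_name, task_data_path,
# task_config_path) signature, so they can be dispatched from a single table.
_EXAMPLE_DOWNLOADERS = {
    "winogrande": download_winogrande_data_and_write_config,
    "swag": download_swag_data_and_write_config,
    "qamr": download_qamr_data_and_write_config,
    "fever_nli": download_fever_nli_data_and_write_config,
    "mctest500": download_mctest500_data_and_write_config,
    "abductive_nli": download_abductive_nli_data_and_write_config,
}


def example_download_task(task_name: str, data_base_path: str, config_base_path: str):
    """Hypothetical helper: download one task's data and write its config."""
    os.makedirs(config_base_path, exist_ok=True)
    _EXAMPLE_DOWNLOADERS[task_name](
        task_name=task_name,
        task_data_path=os.path.join(data_base_path, task_name),
        task_config_path=os.path.join(config_base_path, f"{task_name}_config.json"),
    )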