Example #1
def download_winogrande_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://storage.googleapis.com/ai2-mosaic/public/winogrande/winogrande_1.1.zip",
        task_data_path,
    )

    task_data_path = os.path.join(task_data_path, "winogrande_1.1")

    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train_xl.jsonl"),
                "train_labels": os.path.join(task_data_path, "train_xl-labels.lst"),
                "train_xs": os.path.join(task_data_path, "train_xs.jsonl"),
                "train_xs_labels": os.path.join(task_data_path, "train_xs-labels.lst"),
                "train_s": os.path.join(task_data_path, "train_s.jsonl"),
                "train_s_labels": os.path.join(task_data_path, "train_s-labels.lst"),
                "train_m": os.path.join(task_data_path, "train_m.jsonl"),
                "train_m_labels": os.path.join(task_data_path, "train_m-labels.lst"),
                "train_l": os.path.join(task_data_path, "train_l.jsonl"),
                "train_l_labels": os.path.join(task_data_path, "train_l-labels.lst"),
                "train_xl": os.path.join(task_data_path, "train_xl.jsonl"),
                "train_xl_labels": os.path.join(task_data_path, "train_xl-labels.lst"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "val_labels": os.path.join(task_data_path, "dev-labels.lst"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
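A minimal usage sketch for the downloader above; the paths here are illustrative, not part of the source:

# Hypothetical invocation: fetch WinoGrande into ./data and emit a jiant task config.
download_winogrande_data_and_write_config(
    task_name="winogrande",
    task_data_path="./data/winogrande",
    task_config_path="./configs/winogrande_config.json",
)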
Example #2
def download_swag_data_and_write_config(task_name: str, task_data_path: str,
                                        task_config_path: str):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://github.com/rowanz/swagaf/archive/master.zip",
        task_data_path,
    )
    for phase in ["train", "val", "test"]:
        os.rename(
            os.path.join(task_data_path, "swagaf-master", "data",
                         f"{phase}.csv"),
            os.path.join(task_data_path, f"{phase}.csv"),
        )
    shutil.rmtree(os.path.join(task_data_path, "swagaf-master"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.csv"),
                "val": os.path.join(task_data_path, "val.csv"),
                "test": os.path.join(task_data_path, "test.csv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #3
def download_qamr_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://github.com/uwnlp/qamr/archive/master.zip", task_data_path,
    )
    data_phase_list = ["train", "dev", "test"]
    jiant_phase_list = ["train", "val", "test"]
    for data_phase, jiant_phase in zip(data_phase_list, jiant_phase_list):
        os.rename(
            os.path.join(task_data_path, "qamr-master", "data", "filtered", f"{data_phase}.tsv"),
            os.path.join(task_data_path, f"{jiant_phase}.tsv"),
        )
    os.rename(
        os.path.join(task_data_path, "qamr-master", "data", "wiki-sentences.tsv"),
        os.path.join(task_data_path, "wiki-sentences.tsv"),
    )
    shutil.rmtree(os.path.join(task_data_path, "qamr-master"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.tsv"),
                "val": os.path.join(task_data_path, "val.tsv"),
                "test": os.path.join(task_data_path, "test.tsv"),
                "wiki_dict": os.path.join(task_data_path, "wiki-sentences.tsv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #4
def download_mlqa_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    mlqa_temp_path = py_io.create_dir(task_data_base_path, "mlqa_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip", mlqa_temp_path,
    )
    languages = "ar de en es hi vi zh".split()
    for lang1, lang2 in itertools.product(languages, languages):
        task_name = f"mlqa_{lang1}_{lang2}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, f"dev-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "dev", f"dev-context-{lang1}-question-{lang2}.json"
            ),
            dst=val_path,
        )
        test_path = os.path.join(task_data_path, f"test-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "test", f"test-context-{lang1}-question-{lang2}.json"
            ),
            dst=test_path,
        )
        py_io.write_json(
            data={
                "task": "mlqa",
                "paths": {"val": val_path, "test": test_path},
                "kwargs": {"context_language": lang1, "question_language": lang2},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(mlqa_temp_path)
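Since itertools.product pairs every context language with every question language, including same-language pairs, the loop above emits 7 × 7 = 49 task configs. A small self-contained check of that count:

import itertools

languages = "ar de en es hi vi zh".split()
task_names = [f"mlqa_{l1}_{l2}" for l1, l2 in itertools.product(languages, languages)]
assert len(task_names) == 49  # every (context, question) pair, same-language included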
Example #5
def download_xnli_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    xnli_temp_path = py_io.create_dir(task_data_base_path, "xnli_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip", xnli_temp_path,
    )
    full_val_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.dev.jsonl"))
    val_data = datastructures.group_by(full_val_data, key_func=lambda elem: elem["language"])
    full_test_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.test.jsonl"))
    test_data = datastructures.group_by(full_test_data, key_func=lambda elem: elem["language"])
    languages = sorted(val_data)
    for lang in languages:
        task_name = f"xnli_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, "val.jsonl")
        test_path = os.path.join(task_data_path, "test.jsonl")
        py_io.write_jsonl(data=val_data[lang], path=val_path)
        py_io.write_jsonl(data=test_data[lang], path=test_path)
        py_io.write_json(
            data={
                "task": "xnli",
                "paths": {"val": val_path, "test": test_path},
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(xnli_temp_path)
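The grouping step relies on jiant's datastructures.group_by; a minimal stand-in with the behavior assumed above (bucket items by key_func, preserving input order) would be:

from collections import defaultdict

def group_by(items, key_func):
    # Bucket items by key_func, keeping the original order within each group.
    groups = defaultdict(list)
    for item in items:
        groups[key_func(item)].append(item)
    return dict(groups)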
Example #6
def download_fever_nli_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        ("https://www.dropbox.com/s/hylbuaovqwo2zav/nli_fever.zip?dl=1"), task_data_path,
    )
    # Since the FEVER NLI dataset doesn't have labels for the dev set, we also download the original
    # FEVER dev set and match example CIDs to obtain labels.
    orig_dev_path = os.path.join(task_data_path, "fever-dev-temp.jsonl")
    download_utils.download_file(
        "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl", orig_dev_path,
    )
    id_to_label = {}
    for line in py_io.read_jsonl(orig_dev_path):
        if "id" not in line:
            logging.warning("FEVER dev dataset is missing ID.")
            continue
        if "label" not in line:
            logging.warning("FEVER dev dataset is missing label.")
            continue
        id_to_label[line["id"]] = line["label"]
    os.remove(orig_dev_path)

    dev_path = os.path.join(task_data_path, "nli_fever", "dev_fitems.jsonl")
    dev_examples = []
    for line in py_io.read_jsonl(dev_path):
        if "cid" not in line:
            logging.warning("Data in {} is missing CID.".format(dev_path))
            continue
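        # id_to_label is keyed by the original FEVER integer "id", so cast
        # "cid" before the lookup.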
        if int(line["cid"]) not in id_to_label:
            logging.warning("Could not match CID {} to dev data.".format(line["cid"]))
            continue
        dev_example = line
        dev_example["label"] = id_to_label[int(line["cid"])]
        dev_examples.append(dev_example)
    py_io.write_jsonl(dev_examples, os.path.join(task_data_path, "val.jsonl"))
    os.remove(dev_path)

    for phase in ["train", "test"]:
        os.rename(
            os.path.join(task_data_path, "nli_fever", f"{phase}_fitems.jsonl"),
            os.path.join(task_data_path, f"{phase}.jsonl"),
        )
    shutil.rmtree(os.path.join(task_data_path, "nli_fever"))

    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "val.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #7
def download_superglue_data_and_write_config(task_name: str,
                                             task_data_path: str,
                                             task_config_path: str):
    # Applies to ReCoRD, MultiRC, and WSC
    assert task_name not in SQUAD_TASKS

    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        DIRECT_SUPERGLUE_TASKS_TO_DATA_URLS[task_name], task_data_path)

    # Move the task data up one folder (it is nested under the task name when
    # unzipped), i.e. mv ./record/ReCoRD/* ./record
    nested_task_dir = os.path.join(
        task_data_path,
        filesystem.find_case_insensitive_filename(task_name, task_data_path))
    task_data_files = os.listdir(nested_task_dir)
    for f in task_data_files:
        # Moving with a full destination path overwrites any existing file
        shutil.move(os.path.join(nested_task_dir, f),
                    os.path.join(task_data_path, f))
    shutil.rmtree(nested_task_dir)

    # Support datasets whose dev split uses a non-standard file name
    if os.path.isfile(os.path.join(task_data_path, "dev.jsonl")):
        dev_data_name = "dev.jsonl"
    elif os.path.isfile(os.path.join(task_data_path, "val.jsonl")):
        dev_data_name = "val.jsonl"
    else:
        raise RuntimeError("Unsupported dev dataset name in downloaded task.")

    val_path = os.path.join(task_data_path, dev_data_name)
    train_path = os.path.join(task_data_path, "train.jsonl")
    test_path = os.path.join(task_data_path, "test.jsonl")
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": train_path,
                "val": val_path,
                "test": test_path
            },
            "name": task_name,
        },
        path=task_config_path,
    )
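For reference, a plausible shape for the URL map consumed above; the constant lives elsewhere in the module, and the URLs shown here are an assumption, not verified from the source:

# Assumed layout of the task-to-URL map (URLs illustrative):
SUPERGLUE_DATA_BASE_URL = "https://dl.fbaipublicfiles.com/glue/superglue/data/v2"
DIRECT_SUPERGLUE_TASKS_TO_DATA_URLS = {
    "record": f"{SUPERGLUE_DATA_BASE_URL}/ReCoRD.zip",
    "multirc": f"{SUPERGLUE_DATA_BASE_URL}/MultiRC.zip",
    "wsc": f"{SUPERGLUE_DATA_BASE_URL}/WSC.zip",
}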
Example #8
def download_mctest500_data_and_write_config(task_name: str,
                                             task_data_path: str,
                                             task_config_path: str):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://mattr1.github.io/mctest/data/MCTest.zip",
        task_data_path,
    )
    download_utils.download_and_unzip(
        "https://mattr1.github.io/mctest/data/MCTestAnswers.zip",
        task_data_path,
    )
    os.rename(
        os.path.join(task_data_path, "MCTestAnswers", "mc500.test.ans"),
        os.path.join(task_data_path, "MCTest", "mc500.test.ans"),
    )
    shutil.rmtree(os.path.join(task_data_path, "MCTestAnswers"))
    for phase in ["train", "dev", "test"]:
        os.rename(
            os.path.join(task_data_path, "MCTest", f"mc500.{phase}.tsv"),
            os.path.join(task_data_path, f"mc500.{phase}.tsv"),
        )
        os.rename(
            os.path.join(task_data_path, "MCTest", f"mc500.{phase}.ans"),
            os.path.join(task_data_path, f"mc500.{phase}.ans"),
        )
    shutil.rmtree(os.path.join(task_data_path, "MCTest"))

    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "mc500.train.tsv"),
                "train_ans": os.path.join(task_data_path, "mc500.train.ans"),
                "val": os.path.join(task_data_path, "mc500.dev.tsv"),
                "val_ans": os.path.join(task_data_path, "mc500.dev.ans"),
                "test": os.path.join(task_data_path, "mc500.test.tsv"),
                "test_ans": os.path.join(task_data_path, "mc500.test.ans"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #9
def download_abductive_nli_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://storage.googleapis.com/ai2-mosaic/public/alphanli/alphanli-train-dev.zip",
        task_data_path,
    )
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train_inputs": os.path.join(task_data_path, "train.jsonl"),
                "train_labels": os.path.join(task_data_path, "train-labels.lst"),
                "val_inputs": os.path.join(task_data_path, "dev.jsonl"),
                "val_labels": os.path.join(task_data_path, "dev-labels.lst"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #10
def download_tatoeba_data_and_write_config(task_data_base_path: str,
                                           task_config_base_path: str):
    tatoeba_temp_path = py_io.create_dir(task_data_base_path, "tatoeba_temp")
    download_utils.download_and_unzip(
        "https://github.com/facebookresearch/LASER/archive/master.zip",
        tatoeba_temp_path,
    )
    languages_dict = {
        "afr": "af",
        "ara": "ar",
        "bul": "bg",
        "ben": "bn",
        "deu": "de",
        "ell": "el",
        "spa": "es",
        "est": "et",
        "eus": "eu",
        "pes": "fa",
        "fin": "fi",
        "fra": "fr",
        "heb": "he",
        "hin": "hi",
        "hun": "hu",
        "ind": "id",
        "ita": "it",
        "jpn": "ja",
        "jav": "jv",
        "kat": "ka",
        "kaz": "kk",
        "kor": "ko",
        "mal": "ml",
        "mar": "mr",
        "nld": "nl",
        "por": "pt",
        "rus": "ru",
        "swh": "sw",
        "tam": "ta",
        "tel": "te",
        "tha": "th",
        "tgl": "tl",
        "tur": "tr",
        "urd": "ur",
        "vie": "vi",
        "cmn": "zh",
        "eng": "en",
    }
    raw_base_path = os.path.join(tatoeba_temp_path, "LASER-master", "data",
                                 "tatoeba", "v1")
    for full_lang, lang in languages_dict.items():
        task_name = f"tatoeba_{lang}"
        if lang == "en":
            continue
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        eng_src = os.path.join(raw_base_path, f"tatoeba.{full_lang}-eng.eng")
        other_src = os.path.join(raw_base_path,
                                 f"tatoeba.{full_lang}-eng.{full_lang}")
        eng_out = os.path.join(task_data_path, f"{lang}-en.en")
        other_out = os.path.join(task_data_path, f"{lang}-en.{lang}")
        labels_out = os.path.join(task_data_path, f"{lang}-en.labels")
        tgts = [line.strip() for line in py_io.read_file_lines(eng_src)]
        os.rename(src=other_src, dst=other_out)
        idx = range(len(tgts))
        data = zip(tgts, idx)

        # Tatoeba is a retrieval dataset: one set of sentences in English, a
        # parallel set in another language, and the task is to match them. It
        # ships with no training data, so it is evaluation-only. The data is
        # distributed with the sentences aligned, i.e. the retrieval pairing
        # is simply the line order.
        #
        # The XTREME authors intentionally scramble that order by sorting one
        # of the two sets alphabetically. We follow their recipe, but also
        # retain the labels for internal scoring.
        with py_io.get_lock(eng_out):
            with py_io.get_lock(labels_out):
                if os.path.exists(eng_out) and os.path.exists(labels_out):
                    logger.info("Skipping write to %s since it already exists.",
                                eng_out)
                    logger.info("Skipping write to %s since it already exists.",
                                labels_out)
                else:
                    with open(eng_out, "w") as ftgt, open(labels_out,
                                                          "w") as flabels:
                        for t, i in sorted(data, key=lambda x: x[0]):
                            ftgt.write(f"{t}\n")
                            flabels.write(f"{i}\n")
        py_io.write_json(
            data={
                "task": "tatoeba",
                "paths": {
                    "eng": eng_out,
                    "other": other_out,
                    "labels_path": labels_out
                },
                "kwargs": {
                    "language": lang
                },
                "name": task_name,
            },
            path=os.path.join(task_config_base_path,
                              f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(tatoeba_temp_path)
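A toy illustration of the scrambling performed above: the English side is sorted alphabetically, and the label file records each sentence's original line index so the pairing stays recoverable (the sentence strings here are made up):

tgts = ["the cat sat", "a dog barked", "my house is red"]
scrambled = sorted(zip(tgts, range(len(tgts))), key=lambda x: x[0])
# scrambled == [("a dog barked", 1), ("my house is red", 2), ("the cat sat", 0)]
# Line k of the English file now pairs with line labels[k] of the
# other-language file, which keeps its original order.
assert [i for _, i in scrambled] == [1, 2, 0]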