Example #1
def download_mlqa_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    mlqa_temp_path = py_io.create_dir(task_data_base_path, "mlqa_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip", mlqa_temp_path,
    )
    languages = "ar de en es hi vi zh".split()
    for lang1, lang2 in itertools.product(languages, languages):
        task_name = f"mlqa_{lang1}_{lang2}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, f"dev-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "dev", f"dev-context-{lang1}-question-{lang2}.json"
            ),
            dst=val_path,
        )
        test_path = os.path.join(task_data_path, f"test-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "test", f"test-context-{lang1}-question-{lang2}.json"
            ),
            dst=test_path,
        )
        py_io.write_json(
            data={
                "task": "mlqa",
                "paths": {"val": val_path, "test": test_path},
                "kwargs": {"context_language": lang1, "question_language": lang2},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(mlqa_temp_path)
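A minimal usage sketch (the paths below are hypothetical; assumes this module's os, itertools, shutil, py_io, and download_utils imports are in scope):

if __name__ == "__main__":
    # Both base directories are assumed to be creatable via py_io.create_dir.
    download_mlqa_data_and_write_config(
        task_data_base_path="/tmp/tasks/data",
        task_config_base_path="/tmp/tasks/configs",
    )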
Example #2
def download_data(task_names, output_base_path):
    task_data_base_path = py_io.create_dir(output_base_path, "data")
    task_config_base_path = py_io.create_dir(output_base_path, "configs")

    unsupported = set(task_names) - set(SUPPORTED_TASKS)
    assert not unsupported, f"Unsupported tasks: {sorted(unsupported)}"

    # Download data and write a config file for each specified task
    for i, task_name in enumerate(task_names):
        task_data_path = os.path.join(task_data_base_path, task_name)

        if task_name in HF_DATASETS_TASKS:
            hf_datasets_tasks_download.download_data_and_write_config(
                task_name=task_name,
                task_data_path=task_data_path,
                task_config_path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            )
        elif task_name in XTREME_TASKS:
            xtreme_download.download_xtreme_data_and_write_config(
                task_name=task_name,
                task_data_base_path=task_data_base_path,
                task_config_base_path=task_config_base_path,
            )
        elif task_name in DIRECT_DOWNLOAD_TASKS:
            files_tasks_download.download_task_data_and_write_config(
                task_name=task_name,
                task_data_path=task_data_path,
                task_config_path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            )
        else:
            raise KeyError(task_name)
        print(
            f"Downloaded and generated configs for '{task_name}' ({i+1}/{len(task_names)})"
        )
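For illustration, a hedged invocation (paths and task names are hypothetical; SUPPORTED_TASKS is assumed to be the union of HF_DATASETS_TASKS, XTREME_TASKS, and DIRECT_DOWNLOAD_TASKS):

download_data(
    task_names=["xnli", "pawsx"],    # must all be in SUPPORTED_TASKS
    output_base_path="/tmp/xtreme",  # receives data/ and configs/ subdirectories
)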
Example #3
def download_xnli_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    xnli_temp_path = py_io.create_dir(task_data_base_path, "xnli_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip", xnli_temp_path,
    )
    full_val_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.dev.jsonl"))
    val_data = datastructures.group_by(full_val_data, key_func=lambda elem: elem["language"])
    full_test_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.test.jsonl"))
    test_data = datastructures.group_by(full_test_data, key_func=lambda elem: elem["language"])
    languages = sorted(val_data)
    for lang in languages:
        task_name = f"xnli_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, "val.jsonl")
        test_path = os.path.join(task_data_path, "test.jsonl")
        py_io.write_jsonl(data=val_data[lang], path=val_path)
        py_io.write_jsonl(data=test_data[lang], path=test_path)
        py_io.write_json(
            data={
                "task": "xnli",
                "paths": {"val": val_path, "test": test_path},
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(xnli_temp_path)
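datastructures.group_by is assumed to bucket rows by a key function; a minimal stand-in sketch of that helper:

from collections import defaultdict

def group_by(items, key_func):
    # Bucket items by key_func(item); above, XNLI dev/test rows are grouped by their "language" field.
    grouped = defaultdict(list)
    for item in items:
        grouped[key_func(item)].append(item)
    return dict(grouped)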
Example #4
def download_xquad_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    languages = "ar de el en es hi ru th tr vi zh".split()
    for lang in languages:
        task_name = f"xquad_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        path = os.path.join(task_data_path, "xquad.json")
        download_utils.download_file(
            url=f"https://raw.githubusercontent.com/deepmind/xquad/master/xquad.{lang}.json",
            file_path=path,
        )
        py_io.write_json(
            data={
                "task": "xquad",
                "paths": {"val": path},
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            skip_if_exists=True,
        )
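download_utils.download_file is assumed to stream a URL to a local path; a minimal stand-in sketch using only the standard library:

import os
import urllib.request

def download_file(url, file_path):
    # Assumed behavior of download_utils.download_file: fetch url and write it to file_path.
    if os.path.dirname(file_path):
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
    urllib.request.urlretrieve(url, file_path)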
Example #5
def download_pawsx_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    pawsx_temp_path = py_io.create_dir(task_data_base_path, "pawsx_temp")
    download_utils.download_and_untar(
        "https://storage.googleapis.com/paws/pawsx/x-final.tar.gz", pawsx_temp_path,
    )
    languages = sorted(os.listdir(os.path.join(pawsx_temp_path, "x-final")))
    for lang in languages:
        task_name = f"pawsx_{lang}"
        os.rename(
            src=os.path.join(pawsx_temp_path, "x-final", lang),
            dst=os.path.join(task_data_base_path, task_name),
        )
        paths_dict = {
            "val": os.path.join(task_data_base_path, task_name, "dev_2k.tsv"),
            "test": os.path.join(task_data_base_path, task_name, "test_2k.tsv"),
        }
        if lang == "en":
            paths_dict["train"] = os.path.join(task_data_base_path, task_name, "train.tsv")
            datastructures.set_dict_keys(paths_dict, ["train", "val", "test"])
        py_io.write_json(
            data={
                "task": "pawsx",
                "paths": paths_dict,
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(pawsx_temp_path)
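Because the result of set_dict_keys is not assigned above, the helper is assumed to reorder the dict's keys in place; a minimal sketch under that assumption:

def set_dict_keys(d, key_list):
    # Assumed in-place key reordering so the config JSON lists train/val/test in a
    # fixed order (Python 3.7+ dicts preserve insertion order).
    reordered = {k: d[k] for k in key_list}
    d.clear()
    d.update(reordered)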
Example #6
def download_tatoeba_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    tatoeba_temp_path = py_io.create_dir(task_data_base_path, "tatoeba_temp")
    download_utils.download_and_unzip(
        "https://github.com/facebookresearch/LASER/archive/master.zip",
        tatoeba_temp_path,
    )
    languages_dict = {
        "afr": "af",
        "ara": "ar",
        "bul": "bg",
        "ben": "bn",
        "deu": "de",
        "ell": "el",
        "spa": "es",
        "est": "et",
        "eus": "eu",
        "pes": "fa",
        "fin": "fi",
        "fra": "fr",
        "heb": "he",
        "hin": "hi",
        "hun": "hu",
        "ind": "id",
        "ita": "it",
        "jpn": "ja",
        "jav": "jv",
        "kat": "ka",
        "kaz": "kk",
        "kor": "ko",
        "mal": "ml",
        "mar": "mr",
        "nld": "nl",
        "por": "pt",
        "rus": "ru",
        "swh": "sw",
        "tam": "ta",
        "tel": "te",
        "tha": "th",
        "tgl": "tl",
        "tur": "tr",
        "urd": "ur",
        "vie": "vi",
        "cmn": "zh",
        "eng": "en",
    }
    raw_base_path = os.path.join(tatoeba_temp_path, "LASER-master", "data", "tatoeba", "v1")
    for full_lang, lang in languages_dict.items():
        task_name = f"tatoeba_{lang}"
        if lang == "en":
            continue
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        eng_src = os.path.join(raw_base_path, f"tatoeba.{full_lang}-eng.eng")
        other_src = os.path.join(raw_base_path, f"tatoeba.{full_lang}-eng.{full_lang}")
        eng_out = os.path.join(task_data_path, f"{lang}-en.en")
        other_out = os.path.join(task_data_path, f"{lang}-en.{lang}")
        labels_out = os.path.join(task_data_path, f"{lang}-en.labels")
        tgts = [line.strip() for line in py_io.read_file_lines(eng_src)]
        os.rename(src=other_src, dst=other_out)
        idx = range(len(tgts))
        data = zip(tgts, idx)

        # Tatoeba is a retrieval dataset where you have a set of sentences in English and another
        # set in another language, and you need to match them. It also doesn't have training
        # data, so it's pretty much evaluation only. However, the dataset is distributed with the
        # sentences in order, i.e. the retrieval pairing is the sentence order.
        #
        # The XTREME authors intentionally scramble the order by sorting one of the two
        # sets alphabetically. We're following their recipe, but also retaining the labels for
        # internal scoring.
        with py_io.get_lock(eng_out), py_io.get_lock(labels_out):
            if os.path.exists(eng_out) and os.path.exists(labels_out):
                logger.info("Skip writing to %s since it already exists.", eng_out)
                logger.info("Skip writing to %s since it already exists.", labels_out)
            else:
                with open(eng_out, "w") as ftgt, open(labels_out, "w") as flabels:
                    for t, i in sorted(data, key=lambda x: x[0]):
                        ftgt.write(f"{t}\n")
                        flabels.write(f"{i}\n")
        py_io.write_json(
            data={
                "task": "tatoeba",
                "paths": {"eng": eng_out, "other": other_out, "labels_path": labels_out},
                "kwargs": {"language": lang},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(tatoeba_temp_path)
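To make the scrambling concrete, a toy run of the sort-and-label step with hypothetical sentences:

tgts = ["zebra crossing", "apple pie"]  # English side, original (paired) order
for t, i in sorted(zip(tgts, range(len(tgts))), key=lambda x: x[0]):
    print(t, i)  # prints "apple pie 1", then "zebra crossing 0"
# The .labels file written above (here: 1, 0) maps each alphabetized English line back
# to its original index, i.e. to its paired line in the other-language file.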
Example #7
def download_bucc2018_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    bucc2018_temp_path = py_io.create_dir(task_data_base_path, "bucc_temp")
    languages = "de fr ru zh".split()
    for lang in languages:
        download_utils.download_and_untar(
            f"https://comparable.limsi.fr/bucc2018/bucc2018-{lang}-en.training-gold.tar.bz2",
            bucc2018_temp_path,
        )
        download_utils.download_and_untar(
            f"https://comparable.limsi.fr/bucc2018/bucc2018-{lang}-en.sample-gold.tar.bz2",
            bucc2018_temp_path,
        )
    for lang in languages:
        task_name = f"bucc2018_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_eng_path = os.path.join(task_data_path, f"{lang}-en.dev.en")
        val_other_path = os.path.join(task_data_path, f"{lang}-en.dev.{lang}")
        val_labels_path = os.path.join(task_data_path, f"{lang}-en.dev.gold")
        test_eng_path = os.path.join(task_data_path, f"{lang}-en.test.en")
        test_other_path = os.path.join(task_data_path, f"{lang}-en.test.{lang}")
        # sample -> dev
        # training -> test (yup, it's weird)
        os.rename(
            src=os.path.join(
                bucc2018_temp_path, "bucc2018", f"{lang}-en", f"{lang}-en.sample.en"
            ),
            dst=val_eng_path,
        )
        os.rename(
            src=os.path.join(
                bucc2018_temp_path, "bucc2018", f"{lang}-en", f"{lang}-en.sample.{lang}"
            ),
            dst=val_other_path,
        )
        os.rename(
            src=os.path.join(
                bucc2018_temp_path, "bucc2018", f"{lang}-en", f"{lang}-en.sample.gold"
            ),
            dst=val_labels_path,
        )
        os.rename(
            src=os.path.join(
                bucc2018_temp_path, "bucc2018", f"{lang}-en", f"{lang}-en.training.en"
            ),
            dst=test_eng_path,
        )
        os.rename(
            src=os.path.join(
                bucc2018_temp_path, "bucc2018", f"{lang}-en", f"{lang}-en.training.{lang}"
            ),
            dst=test_other_path,
        )
        py_io.write_json(
            data={
                "task": "bucc2018",
                "paths": {
                    "val": {
                        "eng": val_eng_path,
                        "other": val_other_path,
                        "labels": val_labels_path,
                    },
                    "test": {"eng": test_eng_path, "other": test_other_path},
                },
                "kwargs": {"language": lang},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(bucc2018_temp_path)
Example #8
def download_tydiqa_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    tydiqa_temp_path = py_io.create_dir(task_data_base_path, "tydiqa_temp")
    full_train_path = os.path.join(tydiqa_temp_path, "tydiqa-goldp-v1.1-train.json")
    download_utils.download_file(
        "https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-train.json",
        full_train_path,
    )
    download_utils.download_and_untar(
        "https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-dev.tgz",
        tydiqa_temp_path,
    )
    languages_dict = {
        "arabic": "ar",
        "bengali": "bn",
        "english": "en",
        "finnish": "fi",
        "indonesian": "id",
        "korean": "ko",
        "russian": "ru",
        "swahili": "sw",
        "telugu": "te",
    }

    # Split train data
    data = py_io.read_json(full_train_path)
    lang2data = {lang: [] for lang in languages_dict.values()}
    for doc in data["data"]:
        for par in doc["paragraphs"]:
            context = par["context"]
            for qa in par["qas"]:
                question = qa["question"]
                question_id = qa["id"]
                example_lang = languages_dict[question_id.split("-")[0]]
                q_id = question_id.split("-")[-1]
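                # e.g. a hypothetical id "finnish-123456789-0" yields example_lang = "fi"
                # and q_id = "0" (assumed GoldP id layout: "{language}-{hash}-{index}").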
                for answer in qa["answers"]:
                    a_start, a_text = answer["answer_start"], answer["text"]
                    a_end = a_start + len(a_text)
                    assert context[a_start:a_end] == a_text
                lang2data[example_lang].append(
                    {
                        "paragraphs": [
                            {
                                "context": context,
                                "qas": [
                                    {"answers": qa["answers"], "question": question, "id": q_id}
                                ],
                            }
                        ]
                    }
                )

    for full_lang, lang in languages_dict.items():
        task_name = f"tydiqa_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        train_path = os.path.join(task_data_path, f"tydiqa.{lang}.train.json")
        py_io.write_json(
            # Write only this language's examples; writing the full multilingual `data`
            # here would copy the entire train set into every per-language file and
            # leave the per-language split in `lang2data` unused.
            data={"data": lang2data[lang]},
            path=train_path,
            skip_if_exists=True,
        )
        val_path = os.path.join(task_data_path, f"tydiqa.{lang}.dev.json")
        os.rename(
            src=os.path.join(
                tydiqa_temp_path, "tydiqa-goldp-v1.1-dev", f"tydiqa-goldp-dev-{full_lang}.json"
            ),
            dst=val_path,
        )
        py_io.write_json(
            data={
                "task": "tydiqa",
                "paths": {"train": train_path, "val": val_path},
                "kwargs": {"language": lang},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(tydiqa_temp_path)
Example #9
def download_udpos_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    # UDPOS requires networkx==1.11

    def _read_one_file(file):
        # Adapted from https://github.com/JunjieHu/xtreme/blob/
        #              9fe0b142d0ee3eb7dd047ab86f12a76702e79bb4/utils_preprocess.py
        data = []
        sent, tag, lines = [], [], []
        with open(file, "r") as f:
            for line in f:
                items = line.strip().split("\t")
                if len(items) != 10:
                    num_empty = sum(int(w == "_") for w in sent)
                    if num_empty == 0 or num_empty < len(sent) - 1:
                        data.append((sent, tag, lines))
                    sent, tag, lines = [], [], []
                else:
                    sent.append(items[1].strip())
                    tag.append(items[3].strip())
                    lines.append(line.strip())
                    assert len(sent) == int(items[0]), f"line={line}, sent={sent}, tag={tag}"
        return data

    def _remove_empty_space(data):
        # Adapted from https://github.com/google-research/xtreme/blob/
        #              522434d1aece34131d997a97ce7e9242a51a688a/utils_preprocess.py#L212
        new_data = {}
        for split in data:
            new_data[split] = []
            for sent, tag, lines in data[split]:
                new_sent = ["".join(w.replace("\u200c", "").split(" ")) for w in sent]
                lines = [line.replace("\u200c", "") for line in lines]
                assert len(" ".join(new_sent).split(" ")) == len(tag)
                new_data[split].append((new_sent, tag, lines))
        return new_data

    def check_file(file):
        # Adapted from https://github.com/google-research/xtreme/blob/
        #              522434d1aece34131d997a97ce7e9242a51a688a/utils_preprocess.py#L223
        with open(file) as f:
            for i, line in enumerate(f):
                items = line.strip().split("\t")
                assert len(items[0].split(" ")) == len(
                    items[1].split(" ")
                ), f"idx={i}, line={line}"

    def _write_files(data, output_dir, lang_, suffix):
        # Adapted from https://github.com/google-research/xtreme/blob/
        #              522434d1aece34131d997a97ce7e9242a51a688a/utils_preprocess.py#L228
        for split in data:
            if len(data[split]) > 0:
                prefix = os.path.join(output_dir, f"{split}-{lang_}")
                if suffix == "mt":
                    path = prefix + ".mt.tsv"
                    if os.path.exists(path):
                        logger.info("Skip writing to %s since it already exists.", path)
                    else:
                        with py_io.get_lock(path):
                            with open(path, "w") as fout:
                                for idx, (sent, tag, _) in enumerate(data[split]):
                                    newline = "\n" if idx != len(data[split]) - 1 else ""
                                    fout.write(
                                        "{}\t{}{}".format(" ".join(sent), " ".join(tag), newline)
                                    )
                            check_file(path)
                            logger.info("    - finish checking " + path)
                elif suffix == "tsv":
                    path = prefix + ".tsv"
                    if os.path.exists(path):
                        logger.info("Skip writing to %s since it already exists.", path)
                    else:
                        with py_io.get_lock(path):
                            with open(path, "w") as fout:
                                for sidx, (sent, tag, _) in enumerate(data[split]):
                                    for widx, (w, t) in enumerate(zip(sent, tag)):
                                        newline = (
                                            ""
                                            if (sidx == len(data[split]) - 1)
                                            and (widx == len(sent) - 1)
                                            else "\n"
                                        )
                                        fout.write("{}\t{}{}".format(w, t, newline))
                                    fout.write("\n")
                elif suffix == "conll":
                    path = prefix + ".conll"
                    if os.path.exists(path):
                        logger.info("Skip writing to %s since it already exists.", path)
                    else:
                        with open(path, "w") as fout:
                            for _, _, lines in data[split]:
                                for line in lines:
                                    fout.write(line.strip() + "\n")
                                fout.write("\n")
                logger.info(f"finish writing file to {prefix}.{suffix}")

    languages = ("af ar bg de el en es et eu fa fi fr he hi hu id it ja "
                 "kk ko mr nl pt ru ta te th tl tr ur vi yo zh").split()
    udpos_temp_path = py_io.create_dir(task_data_base_path, "udpos_temp")
    download_utils.download_and_untar(
        "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3105/"
        "ud-treebanks-v2.5.tgz",
        udpos_temp_path,
    )
    download_utils.download_file(
        "https://raw.githubusercontent.com/google-research/xtreme/master/third_party/"
        "ud-conversion-tools/lib/conll.py",
        os.path.join(udpos_temp_path, "conll.py"),
    )
    conll = filesystem.import_from_path(os.path.join(udpos_temp_path, "conll.py"))
    conllu_path_ls = sorted(glob.glob(os.path.join(udpos_temp_path, "*", "*", "*.conllu")))
    conll_path = os.path.join(udpos_temp_path, "conll")

    # === Convert conllu files to conll === #
    for input_path in display.tqdm(conllu_path_ls, desc="Convert conllu files to conll format"):
        input_path_fol, input_path_file = os.path.split(input_path)
        lang = input_path_file.split("_")[0]
        os.makedirs(os.path.join(conll_path, lang), exist_ok=True)
        output_path = os.path.join(
            conll_path, lang, strings.replace_suffix(input_path_file, "conllu", "conll")
        )
        pos_rank_precedence_dict = {
            "default": (
                "VERB NOUN PROPN PRON ADJ NUM ADV INTJ AUX ADP DET PART CCONJ SCONJ X PUNCT"
            ).split(" "),
            "es": "VERB AUX PRON ADP DET".split(" "),
            "fr": "VERB AUX PRON NOUN ADJ ADV ADP DET PART SCONJ CONJ".split(" "),
            "it": "VERB AUX ADV PRON ADP DET INTJ".split(" "),
        }

        if lang in pos_rank_precedence_dict:
            current_pos_precedence_list = pos_rank_precedence_dict[lang]
        else:
            current_pos_precedence_list = pos_rank_precedence_dict["default"]

        cio = conll.CoNLLReader()
        orig_treebank = cio.read_conll_u(input_path)
        modif_treebank = copy.copy(orig_treebank)

        for s in modif_treebank:
            s.filter_sentence_content(
                replace_subtokens_with_fused_forms=True,
                posPreferenceDict=current_pos_precedence_list,
                node_properties_to_remove=False,
                remove_deprel_suffixes=False,
                remove_arabic_diacritics=False,
            )

        cio.write_conll(
            list_of_graphs=modif_treebank,
            conll_path=Path(output_path),
            conllformat="conll2006",
            print_fused_forms=True,
            print_comments=False,
        )

    # === Convert conll to final format === #
    for lang in display.tqdm(languages, desc="Convert conll to final format"):
        task_name = f"udpos_{lang}"
        task_data_path = os.path.join(task_data_base_path, task_name)
        os.makedirs(task_data_path, exist_ok=True)
        all_examples = {k: [] for k in ["train", "val", "test"]}
        for path in sorted(glob.glob(os.path.join(conll_path, lang, "*.conll"))):
            examples = _read_one_file(path)
            if "train" in path:
                all_examples["train"] += examples
            elif "dev" in path:
                all_examples["val"] += examples
            elif "test" in path:
                all_examples["test"] += examples
            else:
                raise KeyError(path)
        all_examples = _remove_empty_space(all_examples)
        _write_files(
            data=all_examples,
            output_dir=task_data_path,
            lang_=lang,
            suffix="tsv",
        )
        paths_dict = {
            phase: os.path.join(task_data_path, f"{phase}-{lang}.tsv")
            for phase, phase_data in all_examples.items()
            if len(phase_data) > 0
        }
        py_io.write_json(
            data={
                "task": "udpos",
                "paths": paths_dict,
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(udpos_temp_path)
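For reference, a hedged reader for the "{word}\t{tag}" tsv layout that _write_files emits (sentences are separated by blank lines):

def read_udpos_tsv(path):
    # Parse the format written above: one "word<TAB>tag" pair per line,
    # with a blank line closing each sentence.
    sentences, current = [], []
    with open(path) as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                if current:
                    sentences.append(current)
                    current = []
            else:
                word, tag = line.split("\t")
                current.append((word, tag))
    if current:
        sentences.append(current)
    return sentences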