Example #1
def download_senteval_data_and_write_config(task_name: str,
                                            task_data_path: str,
                                            task_config_path: str):
    name_map = {
        "senteval_bigram_shift": "bigram_shift",
        "senteval_coordination_inversion": "coordination_inversion",
        "senteval_obj_number": "obj_number",
        "senteval_odd_man_out": "odd_man_out",
        "senteval_past_present": "past_present",
        "senteval_sentence_length": "sentence_length",
        "senteval_subj_number": "subj_number",
        "senteval_top_constituents": "top_constituents",
        "senteval_tree_depth": "tree_depth",
        "senteval_word_content": "word_content",
    }
    dataset_name = name_map[task_name]
    os.makedirs(task_data_path, exist_ok=True)
    # data contains all train/val/test examples, first column indicates the split
    data_path = os.path.join(task_data_path, "data.tsv")
    download_utils.download_file(
        url="https://raw.githubusercontent.com/facebookresearch/SentEval/master/data/probing/"
        f"{dataset_name}.txt",
        file_path=data_path,
    )
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "data": data_path
            },
            "name": task_name
        },
        path=task_config_path,
    )
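
For reference, a call might look like the following; the paths here are hypothetical:

download_senteval_data_and_write_config(
    task_name="senteval_bigram_shift",  # must be a key of name_map above
    task_data_path="/content/data/senteval_bigram_shift",
    task_config_path="/content/configs/senteval_bigram_shift_config.json",
)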
Example #2
def download_swag_data_and_write_config(task_name: str, task_data_path: str,
                                        task_config_path: str):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://github.com/rowanz/swagaf/archive/master.zip",
        task_data_path,
    )
    for phase in ["train", "val", "test"]:
        os.rename(
            os.path.join(task_data_path, "swagaf-master", "data",
                         f"{phase}.csv"),
            os.path.join(task_data_path, f"{phase}.csv"),
        )
    shutil.rmtree(os.path.join(task_data_path, "swagaf-master"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.csv"),
                "val": os.path.join(task_data_path, "val.csv"),
                "test": os.path.join(task_data_path, "test.csv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #3
def download_xnli_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    xnli_temp_path = py_io.create_dir(task_data_base_path, "xnli_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip", xnli_temp_path,
    )
    full_val_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.dev.jsonl"))
    val_data = datastructures.group_by(full_val_data, key_func=lambda elem: elem["language"])
    full_test_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.test.jsonl"))
    test_data = datastructures.group_by(full_test_data, lambda elem: elem["language"])
    languages = sorted(list(val_data))
    for lang in languages:
        task_name = f"xnli_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, "val.jsonl")
        test_path = os.path.join(task_data_path, "test.jsonl")
        py_io.write_jsonl(data=val_data[lang], path=val_path)
        py_io.write_jsonl(data=test_data[lang], path=test_path)
        py_io.write_json(
            data={
                "task": "xnli",
                "paths": {"val": val_path, "test": test_path},
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(xnli_temp_path)
Example #4
def download_xquad_data_and_write_config(task_data_base_path: str,
                                         task_config_base_path: str):
    languages = "ar de el en es hi ru th tr vi zh".split()
    for lang in languages:
        task_name = f"xquad_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        path = os.path.join(task_data_path, "xquad.json")
        download_utils.download_file(
            url=f"https://raw.githubusercontent.com/deepmind/xquad/master/xquad.{lang}.json",
            file_path=path,
        )
        py_io.write_json(
            data={
                "task": "xquad",
                "paths": {
                    "val": path
                },
                "name": task_name,
                "kwargs": {
                    "language": lang
                },
            },
            path=os.path.join(task_config_base_path,
                              f"{task_name}_config.json"),
            skip_if_exists=True,
        )
Example #5
def generate_data_config(
        data_base_path,
        output_base_path,
        train,
        val
):
    if "cnli" in train:
        config = {"task": "counterfactual_nli", "paths": {}, "name": "counterfactual_nli"}

        if "seed" in train:
            config["paths"]["train"] = os.path.join(data_base_path, "counterfactual_nli", "train_seed.jsonl")
        else:
            config["paths"]["train"] = os.path.join(data_base_path, "counterfactual_nli", "train.jsonl")
        config["paths"]["test"] = os.path.join(data_base_path, "counterfactual_nli", "test.jsonl")

        val_file = "cnli"

    elif "snli" in train:
        config = {"task": "snli", "paths": {}, "name": "snli"}
        config["paths"]["train"] = os.path.join(data_base_path, train, "train.jsonl")
        config["paths"]["test"] = os.path.join(data_base_path, train, "test.jsonl")

        val_file = "snli"
    else:
        raise KeyError(f"{train} not supported")

    config["paths"]["val"] = os.path.join(data_base_path, "val", val, f"{val_file}.jsonl")

    os.makedirs(os.path.join(output_base_path, "configs"), exist_ok=True)
    py_io.write_json(
        data=config, path=os.path.join(output_base_path, "configs", f"{train}-{val}.json")
    )
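
A hypothetical call, following the branching on the train argument above ("seed" in the name selects train_seed.jsonl); all values are illustrative:

generate_data_config(
    data_base_path="/data",
    output_base_path="/experiments",
    train="cnli_seed",
    val="cnli",
)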
Example #6
def download_ropes_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_untar(
        "https://ropes-dataset.s3-us-west-2.amazonaws.com/train_and_dev/"
        "ropes-train-dev-v1.0.tar.gz",
        task_data_path,
    )
    data_phase_list = ["train", "dev"]
    jiant_phase_list = ["train", "val"]
    for data_phase, jiant_phase in zip(data_phase_list, jiant_phase_list):
        os.rename(
            os.path.join(task_data_path, "ropes-train-dev-v1.0", f"{data_phase}-v1.0.json"),
            os.path.join(task_data_path, f"{jiant_phase}.json"),
        )
    shutil.rmtree(os.path.join(task_data_path, "ropes-train-dev-v1.0"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.json"),
                "val": os.path.join(task_data_path, "val.json"),
            },
            "name": task_name,
            "kwargs": {"include_background": True},
        },
        path=task_config_path,
    )
Example #7
def download_arct_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
    os.makedirs(task_data_path, exist_ok=True)
    file_name_list = [
        "train-doubled.tsv",
        "train-w-swap-doubled.tsv",
        "train-w-swap.tsv",
        "train.tsv",
        "dev.tsv",
        "test.tsv",
    ]
    for file_name in file_name_list:
        download_utils.download_file(
            "https://raw.githubusercontent.com/UKPLab/argument-reasoning-comprehension-task/"
            + f"master/experiments/src/main/python/data/{file_name}",
            os.path.join(task_data_path, file_name),
        )
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.tsv"),
                "val": os.path.join(task_data_path, "val.tsv"),
                "test": os.path.join(task_data_path, "test.tsv"),
                "train_doubled": os.path.join(task_data_path, "train-doubled.tsv"),
                "train_w_swap": os.path.join(task_data_path, "train-w-swap.tsv"),
                "train_w_swap_doubled": os.path.join(task_data_path, "train-w-swap-doubled.tsv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #8
def download_mutual_plus_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    os.makedirs(os.path.join(task_data_path, "train"), exist_ok=True)
    os.makedirs(os.path.join(task_data_path, "dev"), exist_ok=True)
    os.makedirs(os.path.join(task_data_path, "test"), exist_ok=True)
    num_files = {"train": 7088, "dev": 886, "test": 886}
    for phase in num_files:
        examples = []
        for i in range(num_files[phase]):
            file_name = phase + "_" + str(i + 1) + ".txt"
            download_utils.download_file(
                "https://raw.githubusercontent.com/Nealcly/MuTual/"
                + f"master/data/mutual_plus/{phase}/{file_name}",
                os.path.join(task_data_path, phase, file_name),
            )
            for line in py_io.read_file_lines(os.path.join(task_data_path, phase, file_name)):
                examples.append(line)
        py_io.write_jsonl(examples, os.path.join(task_data_path, phase + ".jsonl"))
        shutil.rmtree(os.path.join(task_data_path, phase))

    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #9
def download_data_and_write_config(task_name: str, task_data_path: str,
                                   task_config_path: str):
    hf_datasets_conversion_metadata = HF_DATASETS_CONVERSION_DICT[task_name]
    examples_dict = download_utils.convert_hf_dataset_to_examples(
        path=hf_datasets_conversion_metadata["path"],
        name=hf_datasets_conversion_metadata.get("name"),
        field_map=hf_datasets_conversion_metadata.get("field_map"),
        label_map=hf_datasets_conversion_metadata.get("label_map"),
        phase_map=hf_datasets_conversion_metadata.get("phase_map",
                                                      DEFAULT_PHASE_MAP),
        phase_list=hf_datasets_conversion_metadata.get("phase_list"),
    )
    paths_dict = download_utils.write_examples_to_jsonls(
        examples_dict=examples_dict,
        task_data_path=task_data_path,
    )
    jiant_task_name = hf_datasets_conversion_metadata.get(
        "jiant_task_name", task_name)
    py_io.write_json(
        data={
            "task": jiant_task_name,
            "paths": paths_dict,
            "name": task_name
        },
        path=task_config_path,
    )
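
The function above reads only a handful of keys from HF_DATASETS_CONVERSION_DICT. A hypothetical entry, using exactly those keys and illustrative values, might look like:

HF_DATASETS_CONVERSION_DICT = {
    "example_task": {  # hypothetical entry for illustration
        "path": "hf_dataset_path",  # Hugging Face datasets path
        "name": "hf_config_name",  # optional dataset config name
        "field_map": {"sentence1": "premise"},  # optional column renames
        "label_map": {0: "entailment"},  # optional label-id conversion
        "phase_map": {"validation": "val"},  # optional, defaults to DEFAULT_PHASE_MAP
        "phase_list": ["train", "validation"],  # optional subset of phases
        "jiant_task_name": "example_task",  # optional override, defaults to task_name
    },
}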
Example #10
def download_qasrl_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_untar(
        "http://qasrl.org/data/qasrl-v2.tar", task_data_path,
    )
    data_phase_list = ["train", "dev", "test"]
    jiant_phase_list = ["train", "val", "test"]
    for data_phase, jiant_phase in zip(data_phase_list, jiant_phase_list):
        os.rename(
            os.path.join(task_data_path, "qasrl-v2", "orig", f"{data_phase}.jsonl.gz"),
            os.path.join(task_data_path, f"{jiant_phase}.jsonl.gz"),
        )
    shutil.rmtree(os.path.join(task_data_path, "qasrl-v2"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl.gz"),
                "val": os.path.join(task_data_path, "val.jsonl.gz"),
                "test": os.path.join(task_data_path, "test.jsonl.gz"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #11
def download_qamr_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://github.com/uwnlp/qamr/archive/master.zip", task_data_path,
    )
    data_phase_list = ["train", "dev", "test"]
    jiant_phase_list = ["train", "val", "test"]
    for data_phase, jiant_phase in zip(data_phase_list, jiant_phase_list):
        os.rename(
            os.path.join(task_data_path, "qamr-master", "data", "filtered", f"{data_phase}.tsv"),
            os.path.join(task_data_path, f"{jiant_phase}.tsv"),
        )
    os.rename(
        os.path.join(task_data_path, "qamr-master", "data", "wiki-sentences.tsv"),
        os.path.join(task_data_path, "wiki-sentences.tsv"),
    )
    shutil.rmtree(os.path.join(task_data_path, "qamr-master"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.tsv"),
                "val": os.path.join(task_data_path, "val.tsv"),
                "test": os.path.join(task_data_path, "test.tsv"),
                "wiki_dict": os.path.join(task_data_path, "wiki-sentences.tsv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #12
def write_val_results(val_results_dict,
                      metrics_aggregator,
                      output_dir,
                      verbose=True):
    full_results_to_write = {
        "aggregated": jiant_task_sampler.compute_aggregate_major_metrics_from_results_dict(
            metrics_aggregator=metrics_aggregator,
            results_dict=val_results_dict,
        ),
    }
    for task_name, task_results in val_results_dict.items():
        task_results_to_write = {}
        if "loss" in task_results:
            task_results_to_write["loss"] = task_results["loss"]
        if "metrics" in task_results:
            task_results_to_write["metrics"] = task_results["metrics"].to_dict(
            )
        full_results_to_write[task_name] = task_results_to_write

    metrics_str = json.dumps(full_results_to_write, indent=2)
    if verbose:
        print(metrics_str)

    py_io.write_json(data=full_results_to_write,
                     path=os.path.join(output_dir, "val_metrics.json"))
Example #13
def download_winogrande_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://storage.googleapis.com/ai2-mosaic/public/winogrande/winogrande_1.1.zip",
        task_data_path,
    )

    task_data_path = os.path.join(task_data_path, "winogrande_1.1")

    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train_xl.jsonl"),
                "train_labels": os.path.join(task_data_path, "train_xl-labels.lst"),
                "train_xs": os.path.join(task_data_path, "train_xs.jsonl"),
                "train_xs_labels": os.path.join(task_data_path, "train_xs-labels.lst"),
                "train_s": os.path.join(task_data_path, "train_s.jsonl"),
                "train_s_labels": os.path.join(task_data_path, "train_s-labels.lst"),
                "train_m": os.path.join(task_data_path, "train_m.jsonl"),
                "train_m_labels": os.path.join(task_data_path, "train_m-labels.lst"),
                "train_l": os.path.join(task_data_path, "train_l.jsonl"),
                "train_l_labels": os.path.join(task_data_path, "train_l-labels.lst"),
                "train_xl": os.path.join(task_data_path, "train_xl.jsonl"),
                "train_xl_labels": os.path.join(task_data_path, "train_xl-labels.lst"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "val_labels": os.path.join(task_data_path, "dev-labels.lst"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #14
def full_chunk_and_save(task, phase, examples, feat_spec, tokenizer, args: RunConfiguration):
    """Convert Examples to ListDataset, optionally truncate sequences if possible, and save to disk.

    Args:
        task: Task object
        phase (str): string identifying the data subset (e.g., train, val or test).
        examples (list[Example]): list of task Examples.
        feat_spec: (FeaturizationSpec): Tokenization-related metadata.
        tokenizer: TODO  (issue #1188)
        args (RunConfiguration): run configuration object.

    """
    dataset = preprocessing.convert_examples_to_dataset(
        task=task,
        examples=examples,
        feat_spec=feat_spec,
        tokenizer=tokenizer,
        phase=phase,
        verbose=True,
    )
    if args.smart_truncate:
        dataset, length = preprocessing.smart_truncate(
            dataset=dataset, max_seq_length=args.max_seq_length, verbose=True,
        )
        os.makedirs(os.path.join(args.output_dir, phase), exist_ok=True)
        py_io.write_json(
            data={"truncated_to": int(length)},
            path=os.path.join(args.output_dir, phase, "smart_truncate.json"),
        )
    shared_caching.chunk_and_save(
        data=dataset.data,
        chunk_size=args.chunk_size,
        data_args=args.to_dict(),
        output_dir=os.path.join(args.output_dir, phase),
    )
Example #15
def download_squad_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    if task_name == "squad_v1":
        train_file = "train-v1.1.json"
        dev_file = "dev-v1.1.json"
        version_2_with_negative = False
    elif task_name == "squad_v2":
        train_file = "train-v2.0.json"
        dev_file = "dev-v2.0.json"
        version_2_with_negative = True
    else:
        raise KeyError(task_name)

    os.makedirs(task_data_path, exist_ok=True)
    train_path = os.path.join(task_data_path, train_file)
    val_path = os.path.join(task_data_path, dev_file)
    download_utils.download_file(
        url=f"https://rajpurkar.github.io/SQuAD-explorer/dataset/{train_file}",
        file_path=train_path,
    )
    download_utils.download_file(
        url=f"https://rajpurkar.github.io/SQuAD-explorer/dataset/{dev_file}", file_path=val_path,
    )
    py_io.write_json(
        data={
            "task": "squad",
            "paths": {"train": train_path, "val": val_path},
            "version_2_with_negative": version_2_with_negative,
            "name": task_name,
        },
        path=task_config_path,
    )
Example #16
def download_pawsx_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    pawsx_temp_path = py_io.create_dir(task_data_base_path, "pawsx_temp")
    download_utils.download_and_untar(
        "https://storage.googleapis.com/paws/pawsx/x-final.tar.gz", pawsx_temp_path,
    )
    languages = sorted(os.listdir(os.path.join(pawsx_temp_path, "x-final")))
    for lang in languages:
        task_name = f"pawsx_{lang}"
        os.rename(
            src=os.path.join(pawsx_temp_path, "x-final", lang),
            dst=os.path.join(task_data_base_path, task_name),
        )
        paths_dict = {
            "val": os.path.join(task_data_base_path, task_name, "dev_2k.tsv"),
            "test": os.path.join(task_data_base_path, task_name, "test_2k.tsv"),
        }
        if lang == "en":
            paths_dict["train"] = os.path.join(task_data_base_path, task_name, "train.tsv")
            datastructures.set_dict_keys(paths_dict, ["train", "val", "test"])
        py_io.write_json(
            data={
                "task": "pawsx",
                "paths": paths_dict,
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(pawsx_temp_path)
Example #17
def download_mlqa_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    mlqa_temp_path = py_io.create_dir(task_data_base_path, "mlqa_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip", mlqa_temp_path,
    )
    languages = "ar de en es hi vi zh".split()
    for lang1, lang2 in itertools.product(languages, languages):
        task_name = f"mlqa_{lang1}_{lang2}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, f"dev-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "dev", f"dev-context-{lang1}-question-{lang2}.json"
            ),
            dst=val_path,
        )
        test_path = os.path.join(task_data_path, f"test-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "test", f"test-context-{lang1}-question-{lang2}.json"
            ),
            dst=test_path,
        )
        py_io.write_json(
            data={
                "task": "mlqa",
                "paths": {"val": val_path, "test": test_path},
                "kwargs": {"context_language": lang1, "question_language": lang2},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(mlqa_temp_path)
Example #18
def download_panx_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    def _process_one_file(infile, outfile):
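        # Each data line has the form "<lang>:<token>\t<label>"; the logic below strips
        # the language prefix from the token and keeps blank lines as sentence breaks.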
        with open(infile, "r") as fin:
            lines = fin.readlines()
        if lines[-1].strip() == "":
            lines = lines[:-1]
        with open(outfile, "w") as fout:
            for line in lines:
                items = line.strip().split("\t")
                if len(items) == 2:
                    label = items[1].strip()
                    idx = items[0].find(":")
                    if idx != -1:
                        token = items[0][idx + 1 :].strip()
                        fout.write(f"{token}\t{label}\n")
                else:
                    fout.write("\n")

    panx_temp_path = os.path.join(task_data_base_path, "panx_temp")
    zip_path = os.path.join(panx_temp_path, "AmazonPhotos.zip")
    assert os.path.exists(zip_path), (
        "Download AmazonPhotos.zip from"
        " https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN"
        f" and save it to {zip_path}"
    )
    download_utils.unzip_file(zip_path=zip_path, extract_location=panx_temp_path)
    languages = (
        "af ar bg bn de el en es et eu fa fi fr he hi hu id it ja jv ka "
        "kk ko ml mr ms my nl pt ru sw ta te th tl tr ur vi yo zh"
    ).split()
    for lang in languages:
        task_name = f"panx_{lang}"
        untar_path = os.path.join(panx_temp_path, "panx_dataset", lang)
        os.makedirs(untar_path, exist_ok=True)
        download_utils.untar_file(
            tar_path=os.path.join(panx_temp_path, "panx_dataset", f"{lang}.tar.gz"),
            extract_location=untar_path,
            delete=True,
        )
        task_data_path = os.path.join(task_data_base_path, task_name)
        os.makedirs(task_data_path, exist_ok=True)
        filename_dict = {"train": "train", "val": "dev", "test": "test"}
        paths_dict = {}
        for phase, filename in filename_dict.items():
            in_path = os.path.join(untar_path, filename)
            out_path = os.path.join(task_data_path, f"{phase}.tsv")
            if not os.path.exists(in_path):
                continue
            _process_one_file(infile=in_path, outfile=out_path)
            paths_dict[phase] = out_path
        py_io.write_json(
            data={
                "task": "panx",
                "paths": paths_dict,
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(os.path.join(panx_temp_path, "panx_dataset"))
Example #19
def main():
    full_cl_args = zconf.core.get_sys_args()
    assert len(full_cl_args) >= 2, "Require two arguments to start: configurator and out_path"
    configurator_name, config_path, *cl_args = full_cl_args
    configurator = Registry.get_configurator(configurator_name=configurator_name)
    config_dict = configurator.default_run_cli(cl_args=cl_args).create_config()
    os.makedirs(os.path.split(config_path)[0], exist_ok=True)
    py_io.write_json(config_dict, path=config_path)
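
Per the argument unpacking above, an invocation has this shape (the script entry point is hypothetical):

python configurator_script.py <configurator_name> <config_path> [configurator-specific args ...]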
Example #20
def download_fever_nli_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        ("https://www.dropbox.com/s/hylbuaovqwo2zav/nli_fever.zip?dl=1"), task_data_path,
    )
    # Since the FEVER NLI dataset doesn't have labels for the dev set, we also download the original
    # FEVER dev set and match example CIDs to obtain labels.
    orig_dev_path = os.path.join(task_data_path, "fever-dev-temp.jsonl")
    download_utils.download_file(
        "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl", orig_dev_path,
    )
    id_to_label = {}
    for line in py_io.read_jsonl(orig_dev_path):
        if "id" not in line:
            logging.warning("FEVER dev dataset is missing ID.")
            continue
        if "label" not in line:
            logging.warning("FEVER dev dataset is missing label.")
            continue
        id_to_label[line["id"]] = line["label"]
    os.remove(orig_dev_path)

    dev_path = os.path.join(task_data_path, "nli_fever", "dev_fitems.jsonl")
    dev_examples = []
    for line in py_io.read_jsonl(dev_path):
        if "cid" not in line:
            logging.warning("Data in {} is missing CID.".format(dev_path))
            continue
        if int(line["cid"]) not in id_to_label:
            logging.warning("Could not match CID {} to dev data.".format(line["cid"]))
            continue
        dev_example = line
        dev_example["label"] = id_to_label[int(line["cid"])]
        dev_examples.append(dev_example)
    py_io.write_jsonl(dev_examples, os.path.join(task_data_path, "val.jsonl"))
    os.remove(dev_path)

    for phase in ["train", "test"]:
        os.rename(
            os.path.join(task_data_path, "nli_fever", f"{phase}_fitems.jsonl"),
            os.path.join(task_data_path, f"{phase}.jsonl"),
        )
    shutil.rmtree(os.path.join(task_data_path, "nli_fever"))

    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "val.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #21
def export_model(
    model_type: str,
    output_base_path: str,
    model_class: Type[transformers.PreTrainedModel],
    tokenizer_class: Type[transformers.PreTrainedTokenizer],
    hf_model_name: str = None,
    skip_if_exists: bool = True,
):
    """Retrieve model and tokenizer from Transformers and save all necessary data
    Things saved:
    - Model weights
    - Model config JSON (corresponding to the Transformers model Config object)
    - Tokenizer data
    - JSON file pointing to paths for the above
    Args:
        model_type: Model-type string. See: `get_model_and_tokenizer_classes`
        output_base_path: Base path to save output to
        model_class: Model class
        tokenizer_class: Tokenizer class
        hf_model_name: (Optional) hf_model_name from https://huggingface.co/models,
                       if it differs from model_type
        skip_if_exists: (Optional) if True, skip writing any artifact that already exists
    """
    if hf_model_name is None:
        hf_model_name = model_type

    model_fol_path = os.path.join(output_base_path, "model")

    model_path = os.path.join(model_fol_path, f"{model_type}.p")
    # Needed because some model names include a namespace, e.g. "facebook/bart-base"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model = model_class.from_pretrained(hf_model_name)
    with py_io.get_lock(model_path):
        if skip_if_exists and os.path.exists(model_path):
            logger.info('Skip writing to %s since it already exists.', model_path)
        else:
            torch.save(model.state_dict(), model_path)

    model_config_path = os.path.join(model_fol_path, f"{model_type}.json")
    os.makedirs(os.path.dirname(model_config_path), exist_ok=True)
    py_io.write_json(model.config.to_dict(), model_config_path, skip_if_exists=skip_if_exists)

    tokenizer_fol_path = os.path.join(output_base_path, "tokenizer")
    # os.makedirs(tokenizer_fol_path, exist_ok=True)
    tokenizer = tokenizer_class.from_pretrained(hf_model_name)
    with py_io.get_lock(tokenizer_fol_path):
        if skip_if_exists and os.path.exists(tokenizer_fol_path):
            logger.info('Skip writing to %s since it already exists.', tokenizer_fol_path)
        else:
            tokenizer.save_pretrained(tokenizer_fol_path)

    config = {
        "model_type": model_type,
        "model_path": model_path,
        "model_config_path": model_config_path,
        "model_tokenizer_path": tokenizer_fol_path,
    }
    py_io.write_json(config, os.path.join(output_base_path, f"config.json"), skip_if_exists=skip_if_exists)
Example #22
def save_model_with_metadata(model: nn.Module,
                             metadata: dict,
                             output_dir: str,
                             file_name="model"):
    torch.save(
        torch_utils.get_model_for_saving(model).state_dict(),
        os.path.join(output_dir, f"{file_name}.p"),
    )
    py_io.write_json(metadata,
                     os.path.join(output_dir, f"{file_name}.metadata.json"))
Example #23
def main(args: RunConfiguration):
    os.makedirs(os.path.split(args.output_path)[0], exist_ok=True)
    py_io.write_json(
        single_task_config(
            task_config_path=args.task_config_path,
            task_cache_base_path=args.task_cache_base_path,
            train_batch_size=args.train_batch_size,
            epochs=args.epochs,
        ),
        path=args.output_path,
    )
Example #24
def save_model(self):
    """Override to save only optimized parameters"""
    file_name = f"model__{self.train_state.global_steps:09d}"
    torch.save(
        adapters_modeling.get_optimized_state_dict_for_jiant_model_with_adapters(
            torch_utils.get_model_for_saving(self.model)
        ),
        os.path.join(self.output_dir, f"{file_name}.p"),
    )
    # Write an empty metadata object (a dict, not the string "{}")
    py_io.write_json({}, os.path.join(self.output_dir, f"{file_name}.metadata.json"))
Example #25
def create_and_write_task_config(task_name, task_data_dir, task_config_path):
    task_config_templates = py_io.read_json(
        py_filesystem.get_code_asset_path(
            "assets/simple_api/task_config_templates.json"))
    task_config = get_task_config(
        task_config_templates=task_config_templates,
        task_name=task_name,
        task_data_dir=task_data_dir,
    )
    os.makedirs(os.path.split(task_config_path)[0], exist_ok=True)
    py_io.write_json(task_config, task_config_path)
Example #26
def save_model_with_metadata(model: nn.Module,
                             metadata: dict,
                             output_dir: str,
                             file_name="model"):
    torch.save(
        adapters_modeling.get_optimized_state_dict_for_jiant_model_with_adapters(
            torch_utils.get_model_for_saving(model)
        ),
        os.path.join(output_dir, f"{file_name}.p"),
    )
    py_io.write_json(metadata,
                     os.path.join(output_dir, f"{file_name}.metadata.json"))
Example #27
def export_model(
    model_type: str,
    output_base_path: str,
    layer: int,
    model_class: Type[transformers.PreTrainedModel],
    tokenizer_class: Type[transformers.PreTrainedTokenizer],
    hf_model_name: str = None,
):
    """Retrieve model and tokenizer from Transformers and save all necessary data
    Things saved:
    - Model weights
    - Model config JSON (corresponding to the Transformers model Config object)
    - Tokenizer data
    - JSON file pointing to paths for the above
    Args:
        model_type: Model-type string. See: `get_model_and_tokenizer_classes`
        output_base_path: Base path to save output to
        layer: index of the encoder layer left trainable; all other encoder layers are frozen
        model_class: Model class
        tokenizer_class: Tokenizer class
        hf_model_name: (Optional) hf_model_name from https://huggingface.co/models,
                       if it differs from model_type
    """
    if hf_model_name is None:
        hf_model_name = model_type

    tokenizer_fol_path = os.path.join(output_base_path, "tokenizer")
    model_fol_path = os.path.join(output_base_path, "model")
    os.makedirs(tokenizer_fol_path, exist_ok=True)
    os.makedirs(model_fol_path, exist_ok=True)

    model_path = os.path.join(model_fol_path, f"{model_type}.p")
    model_config_path = os.path.join(model_fol_path, f"{model_type}.json")
    model = model_class.from_pretrained(hf_model_name)

    # Freeze every encoder layer except the selected one (assumes a 12-layer BERT encoder)
    for layer_idx in range(12):
        if layer_idx != layer:
            for param in model.bert.encoder.layer[layer_idx].parameters():
                param.requires_grad = False
    print(f"froze all layers except {layer}")

    torch.save(model.state_dict(), model_path)
    py_io.write_json(model.config.to_dict(), model_config_path)
    tokenizer = tokenizer_class.from_pretrained(hf_model_name)
    tokenizer.save_pretrained(tokenizer_fol_path)
    config = {
        "model_type": model_type,
        "model_path": model_path,
        "model_config_path": model_config_path,
        "model_tokenizer_path": tokenizer_fol_path,
    }
    py_io.write_json(config, os.path.join(output_base_path, f"config.json"))
Example #28
def save_model_with_metadata(
    model_or_state_dict: Union[nn.Module, dict],
    output_dir: str,
    file_name="model",
    metadata: Optional[dict] = None,
):
    if isinstance(model_or_state_dict, dict):
        state_dict = model_or_state_dict
    else:
        state_dict = torch_utils.get_model_for_saving(model_or_state_dict).state_dict()

    torch.save(state_dict, os.path.join(output_dir, f"{file_name}.p"))
    if metadata is not None:
        py_io.write_json(metadata, os.path.join(output_dir, f"{file_name}.metadata.json"))
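
Both call forms accepted by the signature above, with hypothetical paths and metadata:

save_model_with_metadata(model, output_dir="/runs/exp1", metadata={"global_step": 1000})
save_model_with_metadata(model.state_dict(), output_dir="/runs/exp1", file_name="best_model")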
Example #29
def export_model(
    hf_pretrained_model_name_or_path: str,
    output_base_path: str,
):
    """Retrieve model and tokenizer from Transformers and save all necessary data
    Things saved:
    - Model weights
    - Model config JSON (corresponding to the Transformers model Config object)
    - Tokenizer data
    - JSON file pointing to paths for the above
    Args:
        hf_pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                        Can be either:

                            - A string, the `model id` of a pretrained model configuration
                              hosted inside a model repo on huggingface.co.
                              Valid model ids can be located at the root-level, like
                              ``bert-base-uncased``, or namespaced under a user
                              or organization name, like ``dbmdz/bert-base-german-cased``.
                            - A path to a `directory` containing a configuration file saved using
                              the :meth:`~transformers.PretrainedConfig.save_pretrained` method,
                              or the
                              :meth:`~transformers.PreTrainedModel.save_pretrained` method,
                              e.g., ``./my_model_directory/``.
                            - A path or url to a saved configuration JSON `file`, e.g.,
                              ``./my_model_directory/configuration.json``.
        output_base_path: Base path to save output to
    """
    model = AutoModelForPreTraining.from_pretrained(
        hf_pretrained_model_name_or_path)
    model_type = model.config_class.model_type

    model_fol_path = os.path.join(output_base_path, "model")
    model_path = os.path.join(model_fol_path, f"{model_type}.p")
    model_config_path = os.path.join(model_fol_path, f"{model_type}.json")
    tokenizer_fol_path = os.path.join(output_base_path, "tokenizer")

    os.makedirs(tokenizer_fol_path, exist_ok=True)
    os.makedirs(model_fol_path, exist_ok=True)

    torch.save(model.state_dict(), model_path)
    py_io.write_json(model.config.to_dict(), model_config_path)
    tokenizer = AutoTokenizer.from_pretrained(hf_pretrained_model_name_or_path)
    tokenizer.save_pretrained(tokenizer_fol_path)
    config = {
        "model_type": model_type,
        "model_path": model_path,
        "model_config_path": model_config_path,
    }
    py_io.write_json(config, os.path.join(output_base_path, "config.json"))
Example #30
def preprocess_all_glue_data(input_base_path, output_base_path):
    os.makedirs(output_base_path, exist_ok=True)
    os.makedirs(os.path.join(output_base_path, "data"), exist_ok=True)
    os.makedirs(os.path.join(output_base_path, "configs"), exist_ok=True)
    for task_name in tqdm.tqdm(GLUE_CONVERSION):
        task_data_path = os.path.join(output_base_path, "data", task_name)
        paths_dict = convert_glue_data(
            input_base_path=input_base_path,
            task_data_path=task_data_path,
            task_name=task_name,
        )
        config = {"task": task_name, "paths": paths_dict, "name": task_name}
        py_io.write_json(
            data=config,
            path=os.path.join(output_base_path, "configs", f"{task_name}.json"),
        )