Code example #1
    def _create_examples(self, set_type):
        data = read_json(self.path_dict["data"])
        metadata = read_json(self.path_dict["metadata"])
        assert len(data) == len(metadata)
        examples = []
        for data_row, metadata_row in zip(data, metadata):
            row_phase = self.DATA_PHASE_MAP[metadata_row["misc"][self.fold]]
            if row_phase != set_type:
                continue
            examples.append(
                Example(
                    guid=data_row["pair-id"], text=data_row["context"], label=data_row["label"],
                )
            )
        return examples
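Every example on this page reads JSON with a `read_json` helper (sometimes via `py_io.read_json`). As a rough mental model only, and not the actual jiant implementation, it can be thought of as a thin wrapper around `json.load`:

import json

def read_json(path, encoding="utf-8"):
    # Open the file at `path` and parse its contents into a Python object.
    with open(path, "r", encoding=encoding) as f:
        return json.load(f)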
Code example #2
    def compute_metrics_from_accumulator(
        self, task, accumulator: BaseAccumulator, tokenizer, labels
    ) -> Metrics:

        # Todo: Fix val labels cache
        # This is a quick hack
        logits = accumulator.get_accumulated()
        partial_examples = squad_style.data_rows_to_partial_examples(data_rows=labels)
        all_pred_results = squad_style.logits_to_pred_results_list(logits)
        assert task.context_language == task.question_language
        lang = task.context_language
        predictions = squad_style_utils.compute_predictions_logits_v2(
            partial_examples=partial_examples,
            all_results=all_pred_results,
            n_best_size=task.n_best_size,
            max_answer_length=task.max_answer_length,
            do_lower_case=model_resolution.resolve_is_lower_case(tokenizer),
            version_2_with_negative=task.version_2_with_negative,
            null_score_diff_threshold=task.null_score_diff_threshold,
            tokenizer=tokenizer,
            skip_get_final_text=(lang == "zh"),
            verbose=True,
        )
        dataset = read_json(task.val_path)["data"]
        results = mlqa_lib.evaluate(dataset=dataset, predictions=predictions, lang=lang,)
        return Metrics(major=(results["f1"] + results["exact_match"]) / 2, minor=results,)
Code example #3
File: container_setup.py Project: v-mipeng/jiant
def create_jiant_task_container_from_json(
    jiant_task_container_config_path: str, verbose: bool = True
) -> JiantTaskContainer:
    return create_jiant_task_container_from_dict(
        jiant_task_container_config_dict=py_io.read_json(jiant_task_container_config_path),
        verbose=verbose,
    )
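A one-line usage sketch, assuming a run config such as the one produced in code example #21 below (the path is a placeholder):

jiant_task_container = create_jiant_task_container_from_json(
    jiant_task_container_config_path="/path/to/run_configs/mrpc_run_config.json",
    verbose=True,
)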
Code example #4
File: xtreme_submission.py Project: HonoMi/jiant
def run_loop(args: RunConfiguration):
    quick_init_out = initialization.quick_init(args=args, verbose=True)
    with quick_init_out.log_writer.log_context():
        if args.jiant_task_container_path:
            jiant_task_container = container_setup.create_jiant_task_container(
                **py_io.read_json(args.jiant_task_container_path)
            )
        else:
            raise RuntimeError("Need `jiant_task_container_path` or individual config paths")
        runner = setup_runner(
            args=args,
            jiant_task_container=jiant_task_container,
            quick_init_out=quick_init_out,
            verbose=True,
        )
    supertask, output_dir = args.supertask, args.output_dir
    if supertask in ["xnli", "pawsx"]:
        generate_and_write_preds_for_classification(
            runner=runner,
            supertask=supertask,
            output_dir=output_dir,
            skip_if_done=args.skip_if_done,
        )
    elif supertask in ["udpos", "panx"]:
        generate_and_write_preds_for_tagging(
            runner=runner,
            supertask=supertask,
            output_dir=output_dir,
            skip_if_done=args.skip_if_done,
        )
    elif supertask in ["xquad", "mlqa"]:
        generate_and_write_preds_for_qa(
            runner=runner,
            supertask=supertask,
            output_dir=output_dir,
            phase="test",
            skip_if_done=args.skip_if_done,
        )
    elif supertask == "tydiqa":
        generate_and_write_preds_for_qa(
            runner=runner,
            supertask="tydiqa",
            output_dir=output_dir,
            phase="val",
            skip_if_done=args.skip_if_done,
        )
    elif supertask == "bucc2018":
        generate_and_write_preds_for_bucc2018(
            runner=runner,
            output_dir=output_dir,
            bucc_val_metrics_path=args.bucc_val_metrics_path,
            skip_if_done=args.skip_if_done,
        )
    elif supertask == "tatoeba":
        generate_and_write_preds_for_tatoeba(
            runner=runner, output_dir=output_dir, skip_if_done=args.skip_if_done,
        )
    else:
        raise KeyError(supertask)
Code example #5
def main():
    mode, cl_args = zconf.get_mode_and_cl_args()
    if mode == "json":
        args = JsonRunConfiguration.default_run_cli(cl_args=cl_args)
        config_dict = Registry.func_dict[args.func](**py_io.read_json(args.path))
        write_configs(
            config_dict=config_dict, base_path=args.output_base_path,
        )
    else:
        raise zconf.ModeLookupError(mode)
Code example #6
File: xtreme_submission.py Project: yzpang/jiant
def generate_and_write_preds_for_bucc2018(runner,
                                          output_dir: str,
                                          bucc_val_metrics_path: str,
                                          skip_if_done: bool = False):
    """Generate predictions (test) for Bucc2018 and write them in XTREME submission format"""
    preds_pickle_path = os.path.join(output_dir, "bucc2018_test_preds.p")
    if skip_if_done and os.path.exists(preds_pickle_path):
        print(f"Skipping cause {preds_pickle_path} exists")
        return
    else:
        print(f"{preds_pickle_path} does not exist")
    if bucc_val_metrics_path is None:
        # Recompute thresholds:
        val_results_dict = runner.run_val(
            task_name_list=runner.jiant_task_container.task_run_config.val_task_list,
            return_preds=True,
        )
        jiant_evaluate.write_preds(
            eval_results_dict=val_results_dict,
            path=os.path.join(output_dir, "bucc2018_val_preds.p"),
        )
        thresholds_dict = {
            task_name: task_results["metrics"].minor["best-threshold"]
            for task_name, task_results in val_results_dict.items()
        }
    else:
        val_metrics = py_io.read_json(bucc_val_metrics_path)
        thresholds_dict = {
            task_name: val_metrics[task_name]["metrics"]["minor"]["best-threshold"]
            for task_name in runner.jiant_task_container.task_run_config.val_task_list
        }

    preds_output_dir = os.path.join(output_dir, "preds", "bucc2018")
    os.makedirs(preds_output_dir, exist_ok=True)
    test_results_dict = runner.run_test(
        task_name_list=runner.jiant_task_container.task_run_config.test_task_list,
    )
    jiant_evaluate.write_preds(
        eval_results_dict=test_results_dict,
        path=preds_pickle_path,
    )
    for task_name, task_results in test_results_dict.items():
        bitext = bucc2018_lib.bucc_extract(
            cand2score=task_results["preds"],
            th=thresholds_dict[task_name],
        )
        lang = runner.jiant_task_container.task_dict[task_name].language
        with open(os.path.join(preds_output_dir, f"test-{lang}.tsv"),
                  "w") as f:
            for src, trg in bitext:
                f.write(f"{src}\t{trg}\n")
    print(f"Wrote Bucc2018 preds for {len(test_results_dict)} languages")
Code example #7
def create_and_write_task_config(task_name, task_data_dir, task_config_path):
    task_config_templates = py_io.read_json(
        py_filesystem.get_code_asset_path(
            "assets/simple_api/task_config_templates.json"))
    task_config = get_task_config(
        task_config_templates=task_config_templates,
        task_name=task_name,
        task_data_dir=task_data_dir,
    )
    os.makedirs(os.path.split(task_config_path)[0], exist_ok=True)
    py_io.write_json(task_config, task_config_path)
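A minimal usage sketch for the helper above; the task name and paths are placeholders, not taken from the original project:

create_and_write_task_config(
    task_name="mrpc",
    task_data_dir="/path/to/data/mrpc",
    task_config_path="/path/to/data/configs/mrpc_config.json",
)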
Code example #8
File: ropes.py Project: v-mipeng/jiant
    def read_examples(self, path, set_type):
        input_data = read_json(path, encoding="utf-8")["data"]

        is_training = set_type == PHASE.TRAIN
        examples = []
        data = take_one(input_data)
        for paragraph in maybe_tqdm(data["paragraphs"]):
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                # Because answers can also come from questions, we're going to abuse notation
                #   slightly and put the entire background+situation+question into the "context"
                #   and leave nothing for the "question"
                question_text = " "
                if self.include_background:
                    context_segments = [
                        paragraph["background"],
                        paragraph["situation"],
                        qa["question"],
                    ]
                else:
                    context_segments = [paragraph["situation"], qa["question"]]
                full_context = " ".join(segment.strip()
                                        for segment in context_segments)

                if is_training:
                    answer = qa["answers"][0]
                    start_position_character = full_context.find(
                        answer["text"])
                    answer_text = answer["text"]
                    answers = []
                else:
                    start_position_character = None
                    answer_text = None
                    answers = [
                        {
                            "text": answer["text"],
                            "answer_start": full_context.find(answer["text"]),
                        }
                        for answer in qa["answers"]
                    ]

                example = Example(
                    qas_id=qas_id,
                    question_text=question_text,
                    context_text=full_context,
                    answer_text=answer_text,
                    start_position_character=start_position_character,
                    title="",
                    is_impossible=False,
                    answers=answers,
                    background_text=paragraph["background"],
                    situation_text=paragraph["situation"],
                )
                examples.append(example)
        return examples
Code example #9
File: test_export_model.py Project: m-hahn/jiant
def test_export_model(tmp_path, model_type, model_class,
                      hf_pretrained_model_name_or_path):
    export_model(
        hf_pretrained_model_name_or_path=hf_pretrained_model_name_or_path,
        output_base_path=tmp_path,
    )
    read_config = py_io.read_json(os.path.join(tmp_path, "config.json"))
    assert read_config["model_type"] == model_type
    assert read_config["model_path"] == os.path.join(tmp_path, "model",
                                                     f"{model_type}.p")
    assert read_config["model_config_path"] == os.path.join(
        tmp_path, "model", f"{model_type}.json")
Code example #10
File: core.py Project: HonoMi/jiant
    def run_from_parser_json_prepend(cls, parser, cl_args):
        parser.add_argument("--ZZsrc", type=str, action="append")
        parser.add_argument("--ZZoverrides", type=str, nargs="+")
        pre_args, _ = parser.parse_known_args(cl_args)
        if cl_args is None:
            cl_args = sys.argv[1:]
        if pre_args.ZZsrc is not None:
            # Import configs from ZZsrc JSONs
            imported_dict_ls = [read_json(path) for path in pre_args.ZZsrc]
            combined_imported_dict = combine_dicts(imported_dict_ls,
                                                   strict=True)

            # Record which args are going to be overridden

            if pre_args.ZZoverrides is not None:
                raw_overrides = pre_args.ZZoverrides
                overrides = [f"--{k}" for k in raw_overrides]
            else:
                raw_overrides = overrides = []

            attr_dict = cls.get_attr_dict()
            added_args = []
            for k, v in combined_imported_dict.items():
                formatted_k = f"--{k}"
                # Ensure that args from imported, which are not specified to be overridden,
                #   aren't explicitly specified
                if formatted_k in cl_args and formatted_k not in overrides:
                    raise RuntimeError(f"Attempting to override {formatted_k}")

                # Special handling for store_true args
                if cls._is_store_true_arg(attr_dict[k]):
                    if v and k not in raw_overrides:
                        added_args.append(formatted_k)
                else:
                    added_args.append(formatted_k)
                    added_args.append(str(v))
            submitted_args = added_args + cl_args
        else:
            assert pre_args.ZZoverrides is None
            submitted_args = cl_args
        update_parser(
            parser=parser,
            class_with_attributes=cls,
        )
        result, _ = read_parser(
            parser=parser,
            class_with_attributes=cls,
            skip_non_class_attributes=["ZZsrc", "ZZoverrides"],
            args=submitted_args,
        )
        assert isinstance(result, cls)
        return result
Code example #11
def test_export_model(tmp_path, model_type, model_class, tokenizer_class, hf_model_name):
    export_model(
        model_type=model_type,
        output_base_path=tmp_path,
        model_class=model_class,
        tokenizer_class=tokenizer_class,
        hf_model_name=hf_model_name,
    )
    read_config = py_io.read_json(os.path.join(tmp_path, "config.json"))
    assert read_config["model_type"] == model_type
    assert read_config["model_path"] == os.path.join(tmp_path, "model", f"{model_type}.p")
    assert read_config["model_config_path"] == os.path.join(tmp_path, "model", f"{model_type}.json")
    assert read_config["model_tokenizer_path"] == os.path.join(tmp_path, "tokenizer")
Code example #12
def create_task_from_config_path(config_path: str, verbose: bool = False):
    """Creates task instance from task config filepath.

    Args:
        config_path (str): config filepath.
        verbose (bool): True if task config should be printed during task creation.

    Returns:
        Task instance.

    """
    return create_task_from_config(
        read_json(config_path), base_path=os.path.split(config_path)[0], verbose=verbose,
    )
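A brief usage sketch; the path and the config contents shown in the comment are illustrative assumptions (code example #19 below writes a config in this format):

# Assuming /path/to/configs/tydiqa_en_config.json contains something like:
#   {"task": "tydiqa", "paths": {"train": "...", "val": "..."},
#    "kwargs": {"language": "en"}, "name": "tydiqa_en"}
task = create_task_from_config_path("/path/to/configs/tydiqa_en_config.json", verbose=True)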
Code example #13
def test_simple_runscript(tmpdir, task_name, model_type):
    RUN_NAME = f"{test_simple_runscript.__name__}_{task_name}_{model_type}"
    data_dir = str(tmpdir.mkdir("data"))
    exp_dir = str(tmpdir.mkdir("exp"))

    downloader.download_data([task_name], data_dir)

    args = run.RunConfiguration(
        run_name=RUN_NAME,
        exp_dir=exp_dir,
        data_dir=data_dir,
        model_type=model_type,
        tasks=task_name,
        train_examples_cap=16,
        train_batch_size=16,
        no_cuda=True,
    )
    run.run_simple(args)

    val_metrics = py_io.read_json(os.path.join(exp_dir, "runs", RUN_NAME, "val_metrics.json"))
    assert val_metrics["aggregated"] > 0
Code example #14
File: runscript.py Project: dongfangyixi/jiant
def run_simple(args: RunConfiguration, with_continue: bool = False):
    hf_config = AutoConfig.from_pretrained(args.hf_pretrained_model_name_or_path)

    model_cache_path = replace_none(
        args.model_cache_path, default=os.path.join(args.exp_dir, "models")
    )

    with distributed.only_first_process(local_rank=args.local_rank):
        # === Step 1: Write task configs based on templates === #
        full_task_name_list = sorted(list(set(args.train_tasks + args.val_tasks + args.test_tasks)))
        task_config_path_dict = {}
        if args.create_config:
            task_config_path_dict = create_and_write_task_configs(
                task_name_list=full_task_name_list,
                data_dir=args.data_dir,
                task_config_base_path=os.path.join(args.data_dir, "configs"),
            )
        else:
            for task_name in full_task_name_list:
                task_config_path_dict[task_name] = os.path.join(
                    args.data_dir, "configs", f"{task_name}_config.json"
                )

        # === Step 2: Download models === #
        # if not os.path.exists(os.path.join(model_cache_path, hf_config.model_type)):
            # print("Downloading model")
            # export_model.export_model(
            #     hf_pretrained_model_name_or_path=args.hf_pretrained_model_name_or_path,
            #     output_base_path=os.path.join(model_cache_path, hf_config.model_type),
            # )

        # === Step 3: Tokenize and cache === #
        phase_task_dict = {
            "train": args.train_tasks,
            "val": args.val_tasks,
            "test": args.test_tasks,
        }
        for task_name in full_task_name_list:
            phases_to_do = []
            for phase, phase_task_list in phase_task_dict.items():
                if task_name in phase_task_list and not os.path.exists(
                    os.path.join(args.exp_dir, "cache", hf_config.model_type, task_name, phase)
                ):
                    config = read_json(task_config_path_dict[task_name])
                    if phase in config["paths"]:
                        phases_to_do.append(phase)
                    else:
                        phase_task_list.remove(task_name)
            if not phases_to_do:
                continue
            print(f"Tokenizing Task '{task_name}' for phases '{','.join(phases_to_do)}'")
            tokenize_and_cache.main(
                tokenize_and_cache.RunConfiguration(
                    task_config_path=task_config_path_dict[task_name],
                    hf_pretrained_model_name_or_path=args.hf_pretrained_model_name_or_path,
                    output_dir=os.path.join(args.exp_dir, "cache", hf_config.model_type, task_name),
                    phases=phases_to_do,
                    # TODO: Need a strategy for task-specific max_seq_length issues (issue #1176)
                    max_seq_length=args.max_seq_length,
                    smart_truncate=True,
                    do_iter=True,
                )
            )

    # === Step 4: Generate jiant_task_container_config === #
    # We'll do this with a configurator. Creating a jiant_task_config has a surprising
    # number of moving parts.
    jiant_task_container_config = configurator.SimpleAPIMultiTaskConfigurator(
        task_config_base_path=os.path.join(args.data_dir, "configs"),
        task_cache_base_path=os.path.join(args.exp_dir, "cache", hf_config.model_type),
        train_task_name_list=args.train_tasks,
        val_task_name_list=args.val_tasks,
        test_task_name_list=args.test_tasks,
        train_batch_size=args.train_batch_size,
        eval_batch_multiplier=2,
        epochs=args.num_train_epochs,
        num_gpus=torch.cuda.device_count(),
        train_examples_cap=args.train_examples_cap,
    ).create_config()
    os.makedirs(os.path.join(args.exp_dir, "run_configs"), exist_ok=True)
    jiant_task_container_config_path = os.path.join(
        args.exp_dir, "run_configs", f"{args.run_name}_config.json"
    )
    py_io.write_json(jiant_task_container_config, path=jiant_task_container_config_path)

    # === Step 5: Train/Eval! === #
    if args.model_weights_path:
        model_load_mode = "partial"
        model_weights_path = args.model_weights_path
    else:
        # From Transformers
        if any(task_name.startswith("mlm_") for task_name in full_task_name_list):
            model_load_mode = "from_transformers_with_mlm"
        else:
            model_load_mode = "from_transformers"
        model_weights_path = os.path.join(
            model_cache_path, hf_config.model_type, "model", "model.p"
        )
    run_output_dir = os.path.join(args.exp_dir, "runs", args.run_name)

    if (
        args.save_checkpoint_every_steps
        and os.path.exists(os.path.join(run_output_dir, "checkpoint.p"))
        and with_continue
    ):
        print("Resuming")
        checkpoint = torch.load(os.path.join(run_output_dir, "checkpoint.p"))
        run_args = runscript.RunConfiguration.from_dict(checkpoint["metadata"]["args"])
    else:
        print("Running from start")
        run_args = runscript.RunConfiguration(
            # === Required parameters === #
            jiant_task_container_config_path=jiant_task_container_config_path,
            output_dir=run_output_dir,
            # === Model parameters === #
            hf_pretrained_model_name_or_path=args.hf_pretrained_model_name_or_path,
            model_path=model_weights_path,
            model_config_path=os.path.join(
                model_cache_path, hf_config.model_type, "model", "config.json",
            ),
            model_load_mode=model_load_mode,
            # === Running Setup === #
            do_train=bool(args.train_tasks),
            do_val=bool(args.val_tasks),
            do_save=args.do_save,
            do_save_best=args.do_save_best,
            do_save_last=args.do_save_last,
            write_val_preds=args.write_val_preds,
            write_test_preds=args.write_test_preds,
            eval_every_steps=args.eval_every_steps,
            save_every_steps=args.save_every_steps,
            save_checkpoint_every_steps=args.save_checkpoint_every_steps,
            no_improvements_for_n_evals=args.no_improvements_for_n_evals,
            keep_checkpoint_when_done=args.keep_checkpoint_when_done,
            force_overwrite=args.force_overwrite,
            seed=args.seed,
            # === Training Learning Parameters === #
            learning_rate=args.learning_rate,
            adam_epsilon=args.adam_epsilon,
            max_grad_norm=args.max_grad_norm,
            optimizer_type=args.optimizer_type,
            # === Specialized config === #
            no_cuda=args.no_cuda,
            fp16=args.fp16,
            fp16_opt_level=args.fp16_opt_level,
            local_rank=args.local_rank,
            server_ip=args.server_ip,
            server_port=args.server_port,
        )
        checkpoint = None

    runscript.run_loop(args=run_args, checkpoint=checkpoint)
    py_io.write_file(args.to_json(), os.path.join(run_output_dir, "simple_run_config.json"))
Code example #15
def simple_multi_task_config(
    task_meta_config_dict,
    task_cache_dict,
    task_name_list=None,
    epochs=None,
    max_steps=None,
    num_gpus=1,
    train_examples_cap=None,
    warmup_steps_proportion=0.1,
):
    if isinstance(task_meta_config_dict, str):
        task_meta_config_dict = py_io.read_json(os.path.expandvars(task_meta_config_dict))
    if isinstance(task_cache_dict, str):
        task_cache_dict = py_io.read_json(os.path.expandvars(task_cache_dict))
    if task_name_list is None:
        task_name_list = sorted(list(task_meta_config_dict))

    assert (epochs is None) != (max_steps is None)

    # Proportional
    num_examples_dict = {}
    capped_num_examples_dict = {}
    max_steps_not_given = max_steps is None
    if max_steps_not_given:
        assert isinstance(epochs, (int, float))
        max_steps = 0
    for task_name in task_name_list:
        effective_batch_size = (
            task_meta_config_dict[task_name]["train_batch_size"]
            * task_meta_config_dict[task_name]["gradient_accumulation_steps"]
            * num_gpus
        )
        num_examples = get_num_examples_from_cache(
            cache_path=os.path.expandvars(task_cache_dict[task_name]["train"]),
        )
        capped_num_examples = cap_examples(num_examples=num_examples, cap=train_examples_cap)
        num_examples_dict[task_name] = num_examples
        capped_num_examples_dict[task_name] = capped_num_examples
        if max_steps_not_given:
            max_steps += num_examples * epochs // effective_batch_size

    if train_examples_cap is None:
        sampler_config = {
            "sampler_type": "ProportionalMultiTaskSampler",
        }
    else:
        sampler_config = {
            "sampler_type": "SpecifiedProbMultiTaskSampler",
            "task_to_unweighted_probs": capped_num_examples_dict,
        }

    config_dict = {
        "task_config_path_dict": {
            task_name: os.path.expandvars(task_meta_config_dict[task_name]["config_path"])
            for task_name in task_name_list
        },
        "task_cache_config_dict": {
            task_name: {
                "train": os.path.expandvars(task_cache_dict[task_name]["train"]),
                "val": os.path.expandvars(task_cache_dict[task_name]["val"]),
                "val_labels": os.path.expandvars(task_cache_dict[task_name]["val_labels"]),
            }
            for task_name in task_name_list
        },
        "sampler_config": sampler_config,
        "global_train_config": {
            "max_steps": max_steps,
            "warmup_steps": int(max_steps * warmup_steps_proportion),
        },
        "task_specific_configs_dict": {
            task_name: {
                "train_batch_size": task_meta_config_dict[task_name]["train_batch_size"],
                "eval_batch_size": task_meta_config_dict[task_name]["eval_batch_size"],
                "gradient_accumulation_steps": task_meta_config_dict[task_name][
                    "gradient_accumulation_steps"
                ],
                "eval_subset_num": task_meta_config_dict[task_name]["eval_subset_num"],
            }
            for task_name in task_name_list
        },
        "taskmodels_config": {
            "task_to_taskmodel_map": {
                task_name: task_meta_config_dict[task_name]["task_to_taskmodel_map"]
                for task_name in task_name_list
            },
            "taskmodel_config_map": {task_name: None for task_name in task_name_list},
        },
        "task_run_config": {
            "train_task_list": task_name_list,
            "train_val_task_list": task_name_list,
            "val_task_list": task_name_list,
            "test_task_list": task_name_list,
        },
        "metric_aggregator_config": {"metric_aggregator_type": "EqualMetricAggregator"},
    }
    return config_dict
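For reference, a sketch of the inputs the function above expects, inferred from the keys it reads; the task name, paths, and values are placeholders, and either argument may instead be a path to a JSON file with the same structure:

task_meta_config_dict = {
    "mnli": {
        "config_path": "/path/to/configs/mnli_config.json",
        "train_batch_size": 16,
        "eval_batch_size": 32,
        "gradient_accumulation_steps": 1,
        "eval_subset_num": 500,
        "task_to_taskmodel_map": "mnli",
    },
}
task_cache_dict = {
    "mnli": {
        "train": "/path/to/cache/mnli/train",
        "val": "/path/to/cache/mnli/val",
        "val_labels": "/path/to/cache/mnli/val_labels",
    },
}
config_dict = simple_multi_task_config(
    task_meta_config_dict=task_meta_config_dict,
    task_cache_dict=task_cache_dict,
    epochs=3,
)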
Code example #16
    def create_config(self):
        # === Gather task names === #
        # Get the full list of tasks across all phases
        task_name_list_dict = {
            "train": self.parse_task_name_list(self.train_task_name_list),
            "val": self.parse_task_name_list(self.val_task_name_list),
            "test": self.parse_task_name_list(self.test_task_name_list),
        }
        if self.train_val_task_name_list is None:
            task_name_list_dict["train_val"] = task_name_list_dict["train"]
        else:
            task_name_list_dict["train_val"] = self.parse_task_name_list(
                self.train_val_task_name_list)
        full_task_name_list = py_datastructures.get_unique_list_in_order(
            task_name_list_dict.values())

        # === Gather task configs === #
        # Build task_config_path_dict, either via
        #   1. task_config_base_path: where all task configs are contained within a given folder
        #   2. task_config_path_dict: explicitly provided dictionary of config paths,
        #      potentially as a path to a JSON file
        # Use the dictionary directly, or load it from JSON
        if self.task_config_base_path is not None:
            assert self.task_config_path_dict is None
            task_config_path_dict = {
                task_name: os.path.join(self.task_config_base_path,
                                        f"{task_name}_config.json")
                for task_name in full_task_name_list
            }
        else:
            if isinstance(self.task_config_path_dict, str):
                task_config_path_dict = py_io.read_json(
                    os.path.expandvars(self.task_config_path_dict))
            else:
                task_config_path_dict = self.task_config_path_dict

        # === Gather cache === #
        # Build task_cache_base_path, either via
        #   1. task_cache_base_path: where all caches are contained within a given folder
        #   2. task_cache_config_dict: explicitly provided dictionary to cache paths,
        #                              potentially in JSON
        if self.task_cache_base_path is not None:
            assert self.task_cache_config_dict is None
            task_cache_config_dict = {}
            for task_name in full_task_name_list:
                task_cache_config_dict[task_name] = {}
                if task_name in task_name_list_dict["train"]:
                    task_cache_config_dict[task_name]["train"] = os.path.join(
                        self.task_cache_base_path,
                        task_name,
                        "train",
                    )
                if (task_name in task_name_list_dict["train_val"]
                        or task_name in task_name_list_dict["val"]):
                    task_cache_config_dict[task_name]["val"] = os.path.join(
                        self.task_cache_base_path,
                        task_name,
                        "val",
                    )
                    task_cache_config_dict[task_name]["val_labels"] = os.path.join(
                        self.task_cache_base_path,
                        task_name,
                        "val_labels",
                    )
                if task_name in task_name_list_dict["test"]:
                    task_cache_config_dict[task_name]["test"] = os.path.join(
                        self.task_cache_base_path,
                        task_name,
                        "test",
                    )
        elif isinstance(self.task_cache_config_dict, str):
            assert self.task_cache_base_path is None
            task_cache_config_dict = py_io.read_json(self.task_cache_config_dict)
        elif isinstance(self.task_cache_config_dict, dict):
            task_cache_config_dict = self.task_cache_config_dict
        else:
            raise RuntimeError("Need 'task_cache_base_path' or 'task_cache_config_dict'")

        # === Compute training steps === #
        # Computing the number of training steps across multiple tasks is slightly
        # trickier than expected (unless max_steps is explicitly provided)
        # We need to get the number of examples for each task, divide by the
        # effective batch size (batch size per gpu * grad accum steps * number of gpus)
        # AND consider a common use-case where we cap the number of examples from a given task
        assert (self.epochs is None) != (
            self.max_steps is None), "Specify only 'epochs' or 'max_steps'"
        num_examples_dict = {}
        capped_num_examples_dict = {}
        max_steps_not_given = self.max_steps is None
        if max_steps_not_given:
            assert isinstance(self.epochs, (int, float))
            max_steps = 0
        else:
            max_steps = self.max_steps
        for task_name in task_name_list_dict["train"]:
            if self.num_gpus:
                # We multiply by num_gpus because 1 step is done across (potentially) multiple GPUs
                effective_batch_size = (self.train_batch_size *
                                        self.gradient_accumulation_steps *
                                        self.num_gpus)
            else:
                effective_batch_size = self.train_batch_size * self.gradient_accumulation_steps
            num_examples = get_num_examples_from_cache(
                cache_path=os.path.expandvars(task_cache_config_dict[task_name]["train"]),
            )
            capped_num_examples = cap_examples(num_examples=num_examples,
                                               cap=self.train_examples_cap)
            num_examples_dict[task_name] = num_examples
            capped_num_examples_dict[task_name] = capped_num_examples
            if max_steps_not_given:
                max_steps += self.epochs * math.ceil(
                    capped_num_examples / effective_batch_size)

        # === Compute eval_batch_size === #
        # Eval batch size is often a multiple of train batch size,
        #   so we provide 2 ways to specify it
        assert (self.eval_batch_size is None) != (
            self.eval_batch_multiplier is None
        ), "Specify only 'eval_batch_size' or 'eval_batch_multiplier'"
        if self.eval_batch_multiplier is not None:
            eval_batch_size = self.train_batch_size * self.eval_batch_multiplier
        else:
            eval_batch_size = self.eval_batch_size

        # === Configure Sampler === #
        # We sample proportionally by default, unless our training examples are capped per task
        if self.train_examples_cap is None:
            sampler_config = {
                "sampler_type": "ProportionalMultiTaskSampler",
            }
        else:
            sampler_config = {
                "sampler_type": "SpecifiedProbMultiTaskSampler",
                "task_to_unweighted_probs": capped_num_examples_dict,
            }

        # === Build configuration === #
        # Finally, we build our big config dictionary. Congrats!
        config_dict = {
            "task_config_path_dict": task_config_path_dict,
            "task_cache_config_dict": task_cache_config_dict,
            "sampler_config": sampler_config,
            "global_train_config": {
                "max_steps": int(max_steps),
                "warmup_steps": int(max_steps * self.warmup_steps_proportion),
            },
            "task_specific_configs_dict": {
                task_name: {
                    "train_batch_size": self.train_batch_size,
                    "eval_batch_size": eval_batch_size,
                    "gradient_accumulation_steps":
                    self.gradient_accumulation_steps,
                    "eval_subset_num": self.eval_subset_num,
                }
                for task_name in full_task_name_list
            },
            "taskmodels_config": {
                "task_to_taskmodel_map":
                {task_name: task_name
                 for task_name in full_task_name_list},
                "taskmodel_config_map":
                {task_name: None
                 for task_name in full_task_name_list},
            },
            "task_run_config": {
                "train_task_list": task_name_list_dict["train"],
                "train_val_task_list": task_name_list_dict["train_val"],
                "val_task_list": task_name_list_dict["val"],
                "test_task_list": task_name_list_dict["test"],
            },
            "metric_aggregator_config": {
                "metric_aggregator_type": "EqualMetricAggregator"
            },
        }
        return config_dict
Code example #17
def write_configs_from_full(full_config_path):
    write_configs(
        config_dict=py_io.read_json(full_config_path), base_path=os.path.split(full_config_path)[0],
    )
Code example #18
def setup_runner(
    args: RunConfiguration,
    jiant_task_container: container_setup.JiantTaskContainer,
    quick_init_out,
    verbose: bool = True,
) -> jiant_runner.JiantRunner:
    """Setup jiant model, optimizer, and runner, and return runner.

    Args:
        args (RunConfiguration): configuration carrying command line args specifying run params.
        jiant_task_container (container_setup.JiantTaskContainer): task and sampler configs.
        quick_init_out (QuickInitContainer): device (GPU/CPU) and logging configuration.
        verbose: If True, enables printing configuration info (to standard out).

    Returns:
        jiant_runner.JiantRunner

    """
    # TODO document why the distributed.only_first_process() context manager is being used here.
    jiant_model = jiant_model_setup.setup_jiant_model(
        model_type=args.model_type,
        model_config_path=args.model_config_path,
        tokenizer_path=args.model_tokenizer_path,
        task_dict=jiant_task_container.task_dict,
        taskmodels_config=jiant_task_container.taskmodels_config,
    )
    weights_dict = torch.load(args.model_path)
    jiant_model_setup.load_encoder_from_transformers_weights(
        encoder=jiant_model.encoder, weights_dict=weights_dict,
    )
    if args.adapter_config_path:
        adapter_config = adapters_modeling.AdapterConfig.from_dict(
            py_io.read_json(args.adapter_config_path),
        )
    else:
        adapter_config = adapters_modeling.AdapterConfig()
    adapters_modeling.add_shared_adapters_to_jiant_model(
        jiant_model=jiant_model, adapter_config=adapter_config,
    )
    if args.adapters_load_mode and args.adapters_load_path:
        adapters_modeling.delegate_load_for_shared_adapters(
            jiant_model=jiant_model,
            state_dict=torch.load(args.adapters_load_path),
            load_mode=args.adapters_load_mode,
        )
    jiant_model.to(quick_init_out.device)

    (
        optimized_named_parameters,
        _,
    ) = adapters_modeling.get_optimized_named_parameters_for_jiant_model_with_adapters(
        jiant_model=jiant_model,
    )
    optimizer_scheduler = model_setup.create_optimizer_from_params(
        named_parameters=optimized_named_parameters,
        learning_rate=args.learning_rate,
        t_total=jiant_task_container.global_train_config.max_steps,
        warmup_steps=jiant_task_container.global_train_config.warmup_steps,
        warmup_proportion=None,
        verbose=verbose,
    )
    jiant_model, optimizer = model_setup.raw_special_model_setup(
        model=jiant_model,
        optimizer=optimizer_scheduler.optimizer,
        fp16=args.fp16,
        fp16_opt_level=args.fp16_opt_level,
        n_gpu=quick_init_out.n_gpu,
        local_rank=args.local_rank,
    )
    optimizer_scheduler.optimizer = optimizer
    rparams = jiant_runner.RunnerParameters(
        local_rank=args.local_rank,
        n_gpu=quick_init_out.n_gpu,
        fp16=args.fp16,
        max_grad_norm=args.max_grad_norm,
    )
    runner = jiant_runner.JiantRunner(
        jiant_task_container=jiant_task_container,
        jiant_model=jiant_model,
        optimizer_scheduler=optimizer_scheduler,
        device=quick_init_out.device,
        rparams=rparams,
        log_writer=quick_init_out.log_writer,
    )
    return runner
Code example #19
File: xtreme.py Project: HonoMi/jiant
def download_tydiqa_data_and_write_config(task_data_base_path: str,
                                          task_config_base_path: str):
    tydiqa_temp_path = py_io.create_dir(task_data_base_path, "tydiqa_temp")
    full_train_path = os.path.join(tydiqa_temp_path,
                                   "tydiqa-goldp-v1.1-train.json")
    download_utils.download_file(
        "https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-train.json",
        full_train_path,
    )
    download_utils.download_and_untar(
        "https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-dev.tgz",
        tydiqa_temp_path,
    )
    languages_dict = {
        "arabic": "ar",
        "bengali": "bn",
        "english": "en",
        "finnish": "fi",
        "indonesian": "id",
        "korean": "ko",
        "russian": "ru",
        "swahili": "sw",
        "telugu": "te",
    }

    # Split train data
    data = py_io.read_json(full_train_path)
    lang2data = {lang: [] for lang in languages_dict.values()}
    for doc in data["data"]:
        for par in doc["paragraphs"]:
            context = par["context"]
            for qa in par["qas"]:
                question = qa["question"]
                question_id = qa["id"]
                example_lang = languages_dict[question_id.split("-")[0]]
                q_id = question_id.split("-")[-1]
                for answer in qa["answers"]:
                    a_start, a_text = answer["answer_start"], answer["text"]
                    a_end = a_start + len(a_text)
                    assert context[a_start:a_end] == a_text
                lang2data[example_lang].append({
                    "paragraphs": [{
                        "context": context,
                        "qas": [{"answers": qa["answers"], "question": question, "id": q_id}],
                    }]
                })

    for full_lang, lang in languages_dict.items():
        task_name = f"tydiqa_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        train_path = os.path.join(task_data_path, f"tydiqa.{lang}.train.json")
        py_io.write_json(
            data={"data": lang2data[lang]},  # write only this language's split of the train data
            path=train_path,
            skip_if_exists=True,
        )
        val_path = os.path.join(task_data_path, f"tydiqa.{lang}.dev.json")
        os.rename(
            src=os.path.join(tydiqa_temp_path, "tydiqa-goldp-v1.1-dev",
                             f"tydiqa-goldp-dev-{full_lang}.json"),
            dst=val_path,
        )
        py_io.write_json(
            data={
                "task": "tydiqa",
                "paths": {
                    "train": train_path,
                    "val": val_path
                },
                "kwargs": {
                    "language": lang
                },
                "name": task_name,
            },
            path=os.path.join(task_config_base_path,
                              f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(tydiqa_temp_path)
Code example #20
    def get_tags_to_id(self):
        tags_to_id = read_json(self.path_dict["tags_to_id"])
        tags_to_id = {k: int(v) for k, v in tags_to_id.items()}
        return tags_to_id
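Illustrative only (assumed file contents): the int() coercion above turns string-valued ids from the JSON file into an integer-valued mapping:

tags_to_id_raw = {"O": "0", "B-PER": "1", "I-PER": "2"}  # assumed contents of the tags_to_id file
tags_to_id = {k: int(v) for k, v in tags_to_id_raw.items()}
assert tags_to_id == {"O": 0, "B-PER": 1, "I-PER": 2}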
Code example #21
File: make_config.py Project: zphang/nyu-jiant
def single_task_config(task_config_path,
                       train_batch_size=None,
                       task_cache_base_path=None,
                       task_cache_train_path=None,
                       task_cache_val_path=None,
                       task_cache_val_labels_path=None,
                       epochs=None,
                       max_steps=None,
                       eval_batch_multiplier=2,
                       eval_batch_size=None,
                       gradient_accumulation_steps=1,
                       eval_subset_num=500,
                       warmup_steps_proportion=0.1,
                       phases=("train", "val")):
    task_config = py_io.read_json(os.path.expandvars(task_config_path))
    task_name = task_config["name"]

    do_train = "train" in phases
    do_val = "val" in phases

    cache_path_dict = {}
    if do_train:
        if task_cache_train_path is None:
            task_cache_train_path = os.path.join(task_cache_base_path, "train")
        cache_path_dict["train"] = os.path.expandvars(task_cache_train_path)

    if do_val:
        if task_cache_val_path is None:
            task_cache_val_path = os.path.join(task_cache_base_path, "val")
        if task_cache_val_labels_path is None:
            task_cache_val_labels_path = os.path.join(task_cache_base_path,
                                                      "val_labels")
        cache_path_dict["val"] = os.path.expandvars(task_cache_val_path)
        cache_path_dict["val_labels"] = os.path.expandvars(
            task_cache_val_labels_path)

    if do_train:
        assert (epochs is None) != (max_steps is None)
        assert train_batch_size is not None
        effective_batch_size = train_batch_size * gradient_accumulation_steps
        num_training_examples = get_num_examples_from_cache(
            cache_path=os.path.expandvars(task_cache_train_path), )
        max_steps = num_training_examples * epochs // effective_batch_size
    else:
        max_steps = 0
        train_batch_size = 0

    if do_val:
        if eval_batch_size is None:
            assert train_batch_size is not None
            eval_batch_size = train_batch_size * eval_batch_multiplier

    config_dict = {
        "task_config_path_dict": {
            task_name: os.path.expandvars(task_config_path),
        },
        "task_cache_config_dict": {
            task_name: cache_path_dict,
        },
        "sampler_config": {
            "sampler_type": "UniformMultiTaskSampler",
        },
        "global_train_config": {
            "max_steps": max_steps,
            "warmup_steps": int(max_steps * warmup_steps_proportion),
        },
        "task_specific_configs_dict": {
            task_name: {
                "train_batch_size": train_batch_size,
                "eval_batch_size": eval_batch_size,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "eval_subset_num": eval_subset_num,
            },
        },
        "taskmodels_config": {
            "task_to_taskmodel_map": {
                task_name: task_name,
            },
            "taskmodel_config_map": {
                task_name: None,
            }
        },
        "task_run_config": {
            "train_task_list": [task_name] if do_train else [],
            "train_val_task_list": [task_name] if do_train else [],
            "val_task_list": [task_name] if do_val else [],
            "test_task_list": [],
        },
        "metric_aggregator_config": {
            "metric_aggregator_type": "EqualMetricAggregator",
        },
    }
    return config_dict
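A usage sketch with placeholder paths and assumed values: the returned dictionary can be written out with py_io.write_json and then loaded as a jiant task container config, as in code example #3:

config_dict = single_task_config(
    task_config_path="/path/to/configs/mrpc_config.json",
    task_cache_base_path="/path/to/cache/mrpc",
    train_batch_size=16,
    epochs=3,
)
py_io.write_json(config_dict, path="/path/to/run_configs/mrpc_run_config.json")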