def test_get_unique_list_in_order():
    assert py_datastructures.get_unique_list_in_order([[1, 2], [3],
                                                       [4]]) == [1, 2, 3, 4]
    assert py_datastructures.get_unique_list_in_order([[1, 2, 3], [3],
                                                       [4]]) == [1, 2, 3, 4]
    assert py_datastructures.get_unique_list_in_order([[1, 2, 3], [4],
                                                       [3]]) == [1, 2, 3, 4]
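The implementation under test is not reproduced on this page. A minimal sketch consistent with the assertions above (flatten the sub-lists in order, keeping only the first occurrence of each element; not necessarily the library's actual code):

def get_unique_list_in_order(list_of_lists):
    # Flatten the sub-lists in order, dropping repeated elements
    # (sketch inferred from the test's expected behavior).
    seen = set()
    result = []
    for sub_list in list_of_lists:
        for elem in sub_list:
            if elem not in seen:
                seen.add(elem)
                result.append(elem)
    return result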
Example #2
    def create_config(self):
        # === Gather task names === #
        # Get the full list of tasks across all phases
        task_name_list_dict = {
            "train": self.parse_task_name_list(self.train_task_name_list),
            "val": self.parse_task_name_list(self.val_task_name_list),
            "test": self.parse_task_name_list(self.test_task_name_list),
        }
        if self.train_val_task_name_list is None:
            task_name_list_dict["train_val"] = task_name_list_dict["train"]
        else:
            task_name_list_dict["train_val"] = self.parse_task_name_list(
                self.train_val_task_name_list)
        full_task_name_list = py_datastructures.get_unique_list_in_order(
            task_name_list_dict.values())

        # === Gather task configs === #
        # Build task_config_path_dict, either via
        #   1. task_config_base_path: all task configs live within a given folder
        #   2. task_config_path_dict: an explicitly provided dictionary of
        #      task-config paths, or a path to a JSON file containing one
        if self.task_config_base_path is not None:
            assert self.task_config_path_dict is None
            task_config_path_dict = {
                task_name: os.path.join(self.task_config_base_path,
                                        f"{task_name}_config.json")
                for task_name in full_task_name_list
            }
        else:
            if isinstance(self.task_config_path_dict, str):
                task_config_path_dict = py_io.read_json(
                    os.path.expandvars(self.task_config_path_dict))
            else:
                task_config_path_dict = self.task_config_path_dict

        # === Gather cache === #
        # Build task_cache_config_dict, either via
        #   1. task_cache_base_path: all caches live within a given folder
        #   2. task_cache_config_dict: an explicitly provided dictionary of
        #                              cache paths, or a path to a JSON file
        if self.task_cache_base_path is not None:
            assert self.task_cache_config_dict is None
            task_cache_config_dict = {}
            for task_name in full_task_name_list:
                task_cache_config_dict[task_name] = {}
                if task_name in task_name_list_dict["train"]:
                    task_cache_config_dict[task_name]["train"] = os.path.join(
                        self.task_cache_base_path,
                        task_name,
                        "train",
                    )
                if (task_name in task_name_list_dict["train_val"]
                        or task_name in task_name_list_dict["val"]):
                    task_cache_config_dict[task_name]["val"] = os.path.join(
                        self.task_cache_base_path,
                        task_name,
                        "val",
                    )
                    task_cache_config_dict[task_name][
                        "val_labels"] = os.path.join(
                            self.task_cache_base_path,
                            task_name,
                            "val_labels",
                        )
                if task_name in task_name_list_dict["test"]:
                    task_cache_config_dict[task_name]["test"] = os.path.join(
                        self.task_cache_base_path,
                        task_name,
                        "test",
                    )
        elif isinstance(self.task_cache_config_dict, str):
            assert self.task_cache_base_path is None
            task_cache_config_dict = py_io.read_json(
                self.task_cache_config_dict)
        elif isinstance(self.task_cache_config_dict, dict):
            task_cache_config_dict = self.task_cache_config_dict
        else:
            raise RuntimeError(
                "Need 'task_cache_base_path' or 'task_cache_config_dict'")

        # === Compute training steps === #
        # Computing the number of training steps across multiple tasks is slightly
        # trickier than expected (unless max_steps is explicitly provided)
        # We need to get the number of examples for each task, divide by the
        # effective batch size (batch size per gpu * grad accum steps * number of gpus)
        # AND consider a common use-case where we cap the number of examples from a given task
        assert (self.epochs is None) != (
            self.max_steps is None), "Specify only 'epochs' or 'max_steps'"
        num_examples_dict = {}
        capped_num_examples_dict = {}
        max_steps_not_given = self.max_steps is None
        if max_steps_not_given:
            assert isinstance(self.epochs, (int, float))
            max_steps = 0
        else:
            max_steps = self.max_steps
        for task_name in task_name_list_dict["train"]:
            if self.num_gpus:
                # We multiply by num_gpus because 1 step is done across (potentially) multiple GPUs
                effective_batch_size = (self.train_batch_size *
                                        self.gradient_accumulation_steps *
                                        self.num_gpus)
            else:
                effective_batch_size = self.train_batch_size * self.gradient_accumulation_steps
            num_examples = get_num_examples_from_cache(
                cache_path=os.path.expandvars(
                    task_cache_config_dict[task_name]["train"]))
            capped_num_examples = cap_examples(num_examples=num_examples,
                                               cap=self.train_examples_cap)
            num_examples_dict[task_name] = num_examples
            capped_num_examples_dict[task_name] = capped_num_examples
            if max_steps_not_given:
                max_steps += self.epochs * math.ceil(
                    capped_num_examples / effective_batch_size)
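        # Worked example with hypothetical numbers: 8,000 capped examples,
        # train_batch_size=16, gradient_accumulation_steps=2 and num_gpus=2
        # give an effective batch size of 16 * 2 * 2 = 64, i.e.
        # ceil(8000 / 64) = 125 steps per epoch; with epochs=3 that task
        # contributes 3 * 125 = 375 steps to max_steps.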

        # === Compute eval_batch_size === #
        # Eval batch size is often a multiple of train batch size,
        #   so we provide 2 ways to specify it
        assert (self.eval_batch_size is None) != (
            self.eval_batch_multiplier is
            None), "Specify only 'eval_batch_size' or 'eval_batch_multiplier'"
        if self.eval_batch_multiplier is not None:
            eval_batch_size = self.train_batch_size * self.eval_batch_multiplier
        else:
            eval_batch_size = self.eval_batch_size
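        # E.g. (hypothetical values) train_batch_size=16 with
        # eval_batch_multiplier=2 gives eval_batch_size=32; evaluation needs
        # no gradients, so a larger batch typically fits in memory.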

        # === Configure Sampler === #
        # We sample proportionally by default, unless our training examples are capped per task
        if self.train_examples_cap is None:
            sampler_config = {
                "sampler_type": "ProportionalMultiTaskSampler",
            }
        else:
            sampler_config = {
                "sampler_type": "SpecifiedProbMultiTaskSampler",
                "task_to_unweighted_probs": capped_num_examples_dict,
            }
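        # Design note: when examples are capped, sampling with the capped
        # counts as unweighted probabilities still weights tasks by size but
        # prevents one very large task from dominating the schedule.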

        # === Build configuration === #
        # Finally, we build our big config dictionary. Congrats!
        config_dict = {
            "task_config_path_dict": task_config_path_dict,
            "task_cache_config_dict": task_cache_config_dict,
            "sampler_config": sampler_config,
            "global_train_config": {
                "max_steps": int(max_steps),
                "warmup_steps": int(max_steps * self.warmup_steps_proportion),
            },
            "task_specific_configs_dict": {
                task_name: {
                    "train_batch_size": self.train_batch_size,
                    "eval_batch_size": eval_batch_size,
                    "gradient_accumulation_steps":
                    self.gradient_accumulation_steps,
                    "eval_subset_num": self.eval_subset_num,
                }
                for task_name in full_task_name_list
            },
            "taskmodels_config": {
                "task_to_taskmodel_map":
                {task_name: task_name
                 for task_name in full_task_name_list},
                "taskmodel_config_map":
                {task_name: None
                 for task_name in full_task_name_list},
            },
            "task_run_config": {
                "train_task_list": task_name_list_dict["train"],
                "train_val_task_list": task_name_list_dict["train_val"],
                "val_task_list": task_name_list_dict["val"],
                "test_task_list": task_name_list_dict["test"],
            },
            "metric_aggregator_config": {
                "metric_aggregator_type": "EqualMetricAggregator"
            },
        }
        return config_dict
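create_config also calls two helpers that are not reproduced on this page: get_num_examples_from_cache, which reads the number of training examples recorded in a task's cache, and cap_examples. A minimal sketch of cap_examples, assuming it simply enforces an optional per-task limit (the real helper may differ):

def cap_examples(num_examples, cap):
    # No cap configured: use the full example count; otherwise limit it
    # (sketch inferred from the call site above, not the actual source).
    if cap is None:
        return num_examples
    return min(num_examples, cap)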
Example #3
def generate_configs(args: RunConfiguration):
    xtreme_task = args.xtreme_task
    if xtreme_task == "mlqa":
        xtreme_task_name_list = [
            f"{xtreme_task}_{lang}_{lang}" for lang in LANGS_DICT[xtreme_task]
        ]
    else:
        xtreme_task_name_list = [
            f"{xtreme_task}_{lang}" for lang in LANGS_DICT[xtreme_task]
        ]

    if xtreme_task in TRAINED_TASKS:
        train_task = TRAIN_TASK_DICT[xtreme_task]
        train_task_name_list = [train_task]
        val_task_name_list = get_unique_list_in_order(
            [xtreme_task_name_list, train_task_name_list])
        if args.early_stop_on_xtreme_tasks:
            train_val_task_name_list = val_task_name_list
        else:
            train_val_task_name_list = train_task_name_list
    elif xtreme_task in UNTRAINED_TASKS:
        train_task_name_list = []
        val_task_name_list = xtreme_task_name_list
        train_val_task_name_list = []
    else:
        raise KeyError(xtreme_task)
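    # Assumption (not shown on this page): in an XTREME-style setup, tasks in
    # TRAINED_TASKS are fine-tuned on an English source task taken from
    # TRAIN_TASK_DICT and evaluated zero-shot on the other languages, while
    # UNTRAINED_TASKS (e.g. sentence-retrieval tasks) only use the encoder.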

    if xtreme_task == "udpos":
        test_task_name_list = xtreme_task_name_list + [
            f"udpos_{lang}" for lang in EXTRA_UDPOS_TEST_LANGS
        ]
    elif xtreme_task in ["xquad", "tydiqa", "tatoeba"]:
        test_task_name_list = []
    else:
        test_task_name_list = xtreme_task_name_list

    if not args.suppress_print:
        print("Training on:", ", ".join(train_task_name_list))
        print("Validation on:", ", ".join(val_task_name_list))
        print("Early stopping on:", ", ".join(train_val_task_name_list))
        print("Testing on:", ",".join(test_task_name_list))

    config = configurator.SimpleAPIMultiTaskConfigurator(
        task_config_base_path=args.task_config_base_path,
        task_cache_base_path=args.task_cache_base_path,
        train_task_name_list=train_task_name_list,
        train_val_task_name_list=train_val_task_name_list,
        val_task_name_list=val_task_name_list,
        test_task_name_list=test_task_name_list,
        epochs=args.epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_multiplier=args.eval_batch_multiplier,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        eval_subset_num=args.eval_subset_num,
        num_gpus=args.num_gpus,
        warmup_steps_proportion=args.warmup_steps_proportion,
    ).create_config()

    # Make sure all tasks use the same task head
    config["taskmodels_config"]["task_to_taskmodel_map"] = {
        k: xtreme_task
        for k, v in config["taskmodels_config"]
        ["task_to_taskmodel_map"].items()
    }
    if not args.suppress_print:
        print(f"Assigning all tasks to '{xtreme_task}' head")
    if xtreme_task in UNTRAINED_TASKS:
        # The reference implementation from the XTREME paper uses layer 14 for the
        #  retrieval representation.
        config["taskmodels_config"]["taskmodel_config_map"] = {
            xtreme_task: {
                "pooler_type": "mean",
                "layer": args.retrieval_layer
            }
        }

    py_io.write_json(config, args.output_path)
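A hypothetical way to drive generate_configs end to end. RunConfiguration itself is not shown on this page, so this sketch substitutes a plain namespace carrying the attributes the function reads; every value is illustrative:

from types import SimpleNamespace

# Stand-in for RunConfiguration (hypothetical values throughout).
args = SimpleNamespace(
    xtreme_task="xnli",
    task_config_base_path="/path/to/task_configs",
    task_cache_base_path="/path/to/caches",
    early_stop_on_xtreme_tasks=False,
    suppress_print=False,
    epochs=3,
    train_batch_size=16,
    eval_batch_multiplier=2,
    gradient_accumulation_steps=1,
    eval_subset_num=500,
    num_gpus=1,
    warmup_steps_proportion=0.1,
    retrieval_layer=14,
    output_path="/path/to/run_config.json",
)
generate_configs(args)  # writes the assembled config JSON to output_path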