def test_get_unique_list_in_order():
    """get_unique_list_in_order flattens sublists and drops later duplicates."""
    cases = [
        ([[1, 2], [3], [4]], [1, 2, 3, 4]),
        ([[1, 2, 3], [3], [4]], [1, 2, 3, 4]),
        ([[1, 2, 3], [4], [3]], [1, 2, 3, 4]),
    ]
    for list_of_lists, expected in cases:
        assert py_datastructures.get_unique_list_in_order(list_of_lists) == expected
def create_config(self):
    """Assemble the full multi-task training config dictionary.

    Resolves (1) per-phase task name lists, (2) per-task config paths,
    (3) per-task cache paths, (4) total training steps, (5) eval batch
    size, and (6) the sampler configuration, then combines everything
    into one config dict.

    Returns:
        dict: the assembled configuration (see the `config_dict` literal
        at the bottom for the exact schema).

    Raises:
        RuntimeError: if neither 'task_cache_base_path' nor
            'task_cache_config_dict' was provided.
        AssertionError: if mutually exclusive options were both (or
            neither) set, e.g. 'epochs' vs 'max_steps'.
    """
    # === Gather task names === #
    # Get the full list of tasks across all phases
    task_name_list_dict = {
        "train": self.parse_task_name_list(self.train_task_name_list),
        "val": self.parse_task_name_list(self.val_task_name_list),
        "test": self.parse_task_name_list(self.test_task_name_list),
    }
    if self.train_val_task_name_list is None:
        # Default: early-stop ("train_val") on the training tasks themselves
        task_name_list_dict["train_val"] = task_name_list_dict["train"]
    else:
        task_name_list_dict["train_val"] = self.parse_task_name_list(
            self.train_val_task_name_list)
    full_task_name_list = py_datastructures.get_unique_list_in_order(
        task_name_list_dict.values())

    # === Gather task configs === #
    # Build task_config_path_dict, either via
    #   1. task_config_base_path: all task configs live in a given folder
    #   2. task_config_path_dict: explicit mapping, or a path to a JSON file
    if self.task_config_base_path is not None:
        assert self.task_config_path_dict is None
        task_config_path_dict = {
            task_name: os.path.join(
                self.task_config_base_path, f"{task_name}_config.json")
            for task_name in full_task_name_list
        }
    else:
        if isinstance(self.task_config_path_dict, str):
            # A string is treated as a path to a JSON mapping
            task_config_path_dict = py_io.read_json(
                os.path.expandvars(self.task_config_path_dict))
        else:
            task_config_path_dict = self.task_config_path_dict

    # === Gather cache === #
    # Build task_cache_config_dict, either via
    #   1. task_cache_base_path: all caches live in a given folder
    #   2. task_cache_config_dict: explicit mapping, or a path to a JSON file
    if self.task_cache_base_path is not None:
        assert self.task_cache_config_dict is None
        task_cache_config_dict = {}
        for task_name in full_task_name_list:
            task_cache_config_dict[task_name] = {}
            if task_name in task_name_list_dict["train"]:
                task_cache_config_dict[task_name]["train"] = os.path.join(
                    self.task_cache_base_path, task_name, "train",
                )
            if (task_name in task_name_list_dict["train_val"]
                    or task_name in task_name_list_dict["val"]):
                task_cache_config_dict[task_name]["val"] = os.path.join(
                    self.task_cache_base_path, task_name, "val",
                )
                task_cache_config_dict[task_name]["val_labels"] = os.path.join(
                    self.task_cache_base_path, task_name, "val_labels",
                )
            if task_name in task_name_list_dict["test"]:
                task_cache_config_dict[task_name]["test"] = os.path.join(
                    self.task_cache_base_path, task_name, "test",
                )
    elif isinstance(self.task_cache_config_dict, str):
        assert self.task_cache_base_path is None
        task_cache_config_dict = py_io.read_json(self.task_cache_config_dict)
    # BUGFIX: this previously checked isinstance(task_config_path_dict, dict),
    # so an explicitly provided cache dict was only honored when the unrelated
    # task-config mapping happened to be a dict. Check the cache dict itself.
    elif isinstance(self.task_cache_config_dict, dict):
        task_cache_config_dict = self.task_cache_config_dict
    else:
        # BUGFIX: error message previously named a non-existent option
        # 'task_cache_dict'; the real option is 'task_cache_config_dict'.
        raise RuntimeError(
            "Need 'task_cache_base_path' or 'task_cache_config_dict'")

    # === Compute training steps === #
    # Computing the number of training steps across multiple tasks is slightly
    # trickier than expected (unless max_steps is explicitly provided).
    # We need to get the number of examples for each task, divide by the
    # effective batch size (batch size per gpu * grad accum steps * number of
    # gpus), AND consider a common use-case where we cap the number of
    # examples from a given task.
    assert (self.epochs is None) != (
        self.max_steps is None), "Specify only 'epochs' or 'max_steps'"
    num_examples_dict = {}
    capped_num_examples_dict = {}
    max_steps_not_given = self.max_steps is None
    if max_steps_not_given:
        assert isinstance(self.epochs, (int, float))
        max_steps = 0
    else:
        max_steps = self.max_steps
    for task_name in task_name_list_dict["train"]:
        if self.num_gpus:
            # Multiply by num_gpus: 1 step is done across (potentially)
            # multiple GPUs
            effective_batch_size = (
                self.train_batch_size * self.gradient_accumulation_steps
                * self.num_gpus)
        else:
            effective_batch_size = (
                self.train_batch_size * self.gradient_accumulation_steps)
        num_examples = get_num_examples_from_cache(
            cache_path=os.path.expandvars(
                task_cache_config_dict[task_name]["train"]),
        )
        capped_num_examples = cap_examples(
            num_examples=num_examples, cap=self.train_examples_cap)
        num_examples_dict[task_name] = num_examples
        capped_num_examples_dict[task_name] = capped_num_examples
        if max_steps_not_given:
            # Accumulate per-task steps: epochs * ceil(examples / batch)
            max_steps += self.epochs * math.ceil(
                capped_num_examples / effective_batch_size)

    # === Compute eval_batch_size === #
    # Eval batch size is often a multiple of train batch size,
    # so we provide 2 ways to specify it
    assert (self.eval_batch_size is None) != (
        self.eval_batch_multiplier is None
    ), "Specify only 'eval_batch_size' or 'eval_batch_multiplier'"
    if self.eval_batch_multiplier is not None:
        eval_batch_size = self.train_batch_size * self.eval_batch_multiplier
    else:
        eval_batch_size = self.eval_batch_size

    # === Configure Sampler === #
    # We sample proportionally by default, unless our training examples are
    # capped per task (in which case sample by the capped counts)
    if self.train_examples_cap is None:
        sampler_config = {
            "sampler_type": "ProportionalMultiTaskSampler",
        }
    else:
        sampler_config = {
            "sampler_type": "SpecifiedProbMultiTaskSampler",
            "task_to_unweighted_probs": capped_num_examples_dict,
        }

    # === Build configuration === #
    # Finally, we build our big config dictionary. Congrats!
    config_dict = {
        "task_config_path_dict": task_config_path_dict,
        "task_cache_config_dict": task_cache_config_dict,
        "sampler_config": sampler_config,
        "global_train_config": {
            "max_steps": int(max_steps),
            "warmup_steps": int(max_steps * self.warmup_steps_proportion),
        },
        "task_specific_configs_dict": {
            task_name: {
                "train_batch_size": self.train_batch_size,
                "eval_batch_size": eval_batch_size,
                "gradient_accumulation_steps":
                    self.gradient_accumulation_steps,
                "eval_subset_num": self.eval_subset_num,
            }
            for task_name in full_task_name_list
        },
        "taskmodels_config": {
            # By default, each task gets its own task head (1:1 mapping)
            "task_to_taskmodel_map": {
                task_name: task_name for task_name in full_task_name_list
            },
            "taskmodel_config_map": {
                task_name: None for task_name in full_task_name_list
            },
        },
        "task_run_config": {
            "train_task_list": task_name_list_dict["train"],
            "train_val_task_list": task_name_list_dict["train_val"],
            "val_task_list": task_name_list_dict["val"],
            "test_task_list": task_name_list_dict["test"],
        },
        "metric_aggregator_config": {
            "metric_aggregator_type": "EqualMetricAggregator"
        },
    }
    return config_dict
def generate_configs(args: RunConfiguration):
    """Generate and write a multi-task run config for one XTREME task.

    Expands the XTREME task into per-language task names, picks the
    train/val/train_val/test task lists, builds the config via
    SimpleAPIMultiTaskConfigurator, forces all tasks onto a single task
    head, and writes the result as JSON to args.output_path.

    Args:
        args: RunConfiguration with the XTREME task name, paths,
            batch/epoch settings and output path.

    Raises:
        KeyError: if args.xtreme_task is neither a trained nor an
            untrained XTREME task.
    """
    xtreme_task = args.xtreme_task
    # MLQA task names encode the language twice (context + question language)
    if xtreme_task == "mlqa":
        xtreme_task_name_list = [
            f"{xtreme_task}_{lang}_{lang}" for lang in LANGS_DICT[xtreme_task]
        ]
    else:
        xtreme_task_name_list = [
            f"{xtreme_task}_{lang}" for lang in LANGS_DICT[xtreme_task]
        ]
    if xtreme_task in TRAINED_TASKS:
        train_task = TRAIN_TASK_DICT[xtreme_task]
        train_task_name_list = [train_task]
        val_task_name_list = get_unique_list_in_order(
            [xtreme_task_name_list, train_task_name_list])
        if args.early_stop_on_xtreme_tasks:
            train_val_task_name_list = val_task_name_list
        else:
            train_val_task_name_list = train_task_name_list
    elif xtreme_task in UNTRAINED_TASKS:
        # Untrained (e.g. retrieval) tasks: evaluation only, no training
        train_task_name_list = []
        val_task_name_list = xtreme_task_name_list
        train_val_task_name_list = []
    else:
        raise KeyError(xtreme_task)
    if xtreme_task == "udpos":
        # udpos has extra test-only languages beyond the standard list
        test_task_name_list = xtreme_task_name_list + [
            f"udpos_{lang}" for lang in EXTRA_UDPOS_TEST_LANGS
        ]
    elif xtreme_task in ["xquad", "tydiqa", "tatoeba"]:
        test_task_name_list = []
    else:
        test_task_name_list = xtreme_task_name_list
    if not args.suppress_print:
        print("Training on:", ", ".join(train_task_name_list))
        print("Validation on:", ", ".join(val_task_name_list))
        print("Early stopping on:", ", ".join(train_val_task_name_list))
        # BUGFIX: use the same ", " separator as the other phase printouts
        # (previously joined with a bare ",").
        print("Testing on:", ", ".join(test_task_name_list))
    config = configurator.SimpleAPIMultiTaskConfigurator(
        task_config_base_path=args.task_config_base_path,
        task_cache_base_path=args.task_cache_base_path,
        train_task_name_list=train_task_name_list,
        train_val_task_name_list=train_val_task_name_list,
        val_task_name_list=val_task_name_list,
        test_task_name_list=test_task_name_list,
        epochs=args.epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_multiplier=args.eval_batch_multiplier,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        eval_subset_num=args.eval_subset_num,
        num_gpus=args.num_gpus,
        warmup_steps_proportion=args.warmup_steps_proportion,
    ).create_config()
    # Make sure all tasks use the same task head
    # (iterate keys directly; the values were previously fetched and unused)
    config["taskmodels_config"]["task_to_taskmodel_map"] = {
        task_name: xtreme_task
        for task_name in config["taskmodels_config"]["task_to_taskmodel_map"]
    }
    if not args.suppress_print:
        print(f"Assigning all tasks to '{xtreme_task}' head")
    if xtreme_task in UNTRAINED_TASKS:
        # The reference implementation from the XTREME paper uses layer 14
        # for the retrieval representation.
        config["taskmodels_config"]["taskmodel_config_map"] = {
            xtreme_task: {
                "pooler_type": "mean",
                "layer": args.retrieval_layer,
            }
        }
    py_io.write_json(config, args.output_path)