def _create_examples(self, set_type):
    data = read_json(self.path_dict["data"])
    metadata = read_json(self.path_dict["metadata"])
    assert len(data) == len(metadata)
    examples = []
    for data_row, metadata_row in zip(data, metadata):
        row_phase = self.DATA_PHASE_MAP[metadata_row["misc"][self.fold]]
        if row_phase != set_type:
            continue
        examples.append(
            Example(
                guid=data_row["pair-id"],
                text=data_row["context"],
                label=data_row["label"],
            )
        )
    return examples
def compute_metrics_from_accumulator(
    self, task, accumulator: BaseAccumulator, tokenizer, labels
) -> Metrics:
    # TODO: Fix val labels cache (this is a quick hack)
    logits = accumulator.get_accumulated()
    partial_examples = squad_style.data_rows_to_partial_examples(data_rows=labels)
    all_pred_results = squad_style.logits_to_pred_results_list(logits)
    assert task.context_language == task.question_language
    lang = task.context_language
    predictions = squad_style_utils.compute_predictions_logits_v2(
        partial_examples=partial_examples,
        all_results=all_pred_results,
        n_best_size=task.n_best_size,
        max_answer_length=task.max_answer_length,
        do_lower_case=model_resolution.resolve_is_lower_case(tokenizer),
        version_2_with_negative=task.version_2_with_negative,
        null_score_diff_threshold=task.null_score_diff_threshold,
        tokenizer=tokenizer,
        skip_get_final_text=(lang == "zh"),
        verbose=True,
    )
    dataset = read_json(task.val_path)["data"]
    results = mlqa_lib.evaluate(dataset=dataset, predictions=predictions, lang=lang)
    return Metrics(major=(results["f1"] + results["exact_match"]) / 2, minor=results)
def create_jiant_task_container_from_json(
    jiant_task_container_config_path: str, verbose: bool = True
) -> JiantTaskContainer:
    return create_jiant_task_container_from_dict(
        jiant_task_container_config_dict=py_io.read_json(jiant_task_container_config_path),
        verbose=verbose,
    )
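def _example_load_task_container():
    """Usage sketch for create_jiant_task_container_from_json (hypothetical path).

    The JSON read here is the jiant_task_container_config produced by the
    configurator functions further below; its top-level keys include
    "task_config_path_dict", "task_cache_config_dict", "sampler_config",
    "global_train_config", "task_specific_configs_dict", "taskmodels_config",
    "task_run_config", and "metric_aggregator_config".
    """
    # "/path/to/exp/run_configs/my_run_config.json" is an illustrative path
    return create_jiant_task_container_from_json(
        jiant_task_container_config_path="/path/to/exp/run_configs/my_run_config.json",
        verbose=True,
    )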
def run_loop(args: RunConfiguration):
    quick_init_out = initialization.quick_init(args=args, verbose=True)
    with quick_init_out.log_writer.log_context():
        if args.jiant_task_container_path:
            jiant_task_container = container_setup.create_jiant_task_container(
                **py_io.read_json(args.jiant_task_container_path)
            )
        else:
            raise RuntimeError("Need `jiant_task_container_path` or individual config paths")
        runner = setup_runner(
            args=args,
            jiant_task_container=jiant_task_container,
            quick_init_out=quick_init_out,
            verbose=True,
        )
        supertask, output_dir = args.supertask, args.output_dir
        if supertask in ["xnli", "pawsx"]:
            generate_and_write_preds_for_classification(
                runner=runner,
                supertask=supertask,
                output_dir=output_dir,
                skip_if_done=args.skip_if_done,
            )
        elif supertask in ["udpos", "panx"]:
            generate_and_write_preds_for_tagging(
                runner=runner,
                supertask=supertask,
                output_dir=output_dir,
                skip_if_done=args.skip_if_done,
            )
        elif supertask in ["xquad", "mlqa"]:
            generate_and_write_preds_for_qa(
                runner=runner,
                supertask=supertask,
                output_dir=output_dir,
                phase="test",
                skip_if_done=args.skip_if_done,
            )
        elif supertask == "tydiqa":
            generate_and_write_preds_for_qa(
                runner=runner,
                supertask="tydiqa",
                output_dir=output_dir,
                phase="val",
                skip_if_done=args.skip_if_done,
            )
        elif supertask == "bucc2018":
            generate_and_write_preds_for_bucc2018(
                runner=runner,
                output_dir=output_dir,
                bucc_val_metrics_path=args.bucc_val_metrics_path,
                skip_if_done=args.skip_if_done,
            )
        elif supertask == "tatoeba":
            generate_and_write_preds_for_tatoeba(
                runner=runner,
                output_dir=output_dir,
                skip_if_done=args.skip_if_done,
            )
        else:
            raise KeyError(supertask)
def main():
    mode, cl_args = zconf.get_mode_and_cl_args()
    if mode == "json":
        args = JsonRunConfiguration.default_run_cli(cl_args=cl_args)
        config_dict = Registry.func_dict[args.func](**py_io.read_json(args.path))
        write_configs(
            config_dict=config_dict,
            base_path=args.output_base_path,
        )
    else:
        raise zconf.ModeLookupError(mode)
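# Invocation sketch for main() (hypothetical script name and registered function).
# The first positional argument selects the zconf mode; in "json" mode the remaining
# args are parsed into JsonRunConfiguration, whose `func` field must name an entry
# in Registry.func_dict and whose `path` field points to a JSON of kwargs for it:
#
#   python make_configs.py json \
#       --func single_task_config \
#       --path /path/to/kwargs.json \
#       --output_base_path /path/to/run_configs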
def generate_and_write_preds_for_bucc2018(
    runner, output_dir: str, bucc_val_metrics_path: str, skip_if_done: bool = False
):
    """Generate predictions (test) for Bucc2018 and write them in XTREME submission format"""
    preds_pickle_path = os.path.join(output_dir, "bucc2018_test_preds.p")
    if skip_if_done and os.path.exists(preds_pickle_path):
        print(f"Skipping because {preds_pickle_path} exists")
        return
    else:
        print(f"{preds_pickle_path} does not exist")
    if bucc_val_metrics_path is None:
        # Recompute thresholds:
        val_results_dict = runner.run_val(
            task_name_list=runner.jiant_task_container.task_run_config.val_task_list,
            return_preds=True,
        )
        jiant_evaluate.write_preds(
            eval_results_dict=val_results_dict,
            path=os.path.join(output_dir, "bucc2018_val_preds.p"),
        )
        thresholds_dict = {
            task_name: task_results["metrics"].minor["best-threshold"]
            for task_name, task_results in val_results_dict.items()
        }
    else:
        val_metrics = py_io.read_json(bucc_val_metrics_path)
        thresholds_dict = {
            task_name: val_metrics[task_name]["metrics"]["minor"]["best-threshold"]
            for task_name in runner.jiant_task_container.task_run_config.val_task_list
        }
    preds_output_dir = os.path.join(output_dir, "preds", "bucc2018")
    os.makedirs(preds_output_dir, exist_ok=True)
    test_results_dict = runner.run_test(
        task_name_list=runner.jiant_task_container.task_run_config.test_task_list,
    )
    jiant_evaluate.write_preds(
        eval_results_dict=test_results_dict,
        path=preds_pickle_path,
    )
    for task_name, task_results in test_results_dict.items():
        bitext = bucc2018_lib.bucc_extract(
            cand2score=task_results["preds"],
            th=thresholds_dict[task_name],
        )
        lang = runner.jiant_task_container.task_dict[task_name].language
        with open(os.path.join(preds_output_dir, f"test-{lang}.tsv"), "w") as f:
            for src, trg in bitext:
                f.write(f"{src}\t{trg}\n")
    print(f"Wrote Bucc2018 preds for {len(test_results_dict)} languages")
def create_and_write_task_config(task_name, task_data_dir, task_config_path):
    task_config_templates = py_io.read_json(
        py_filesystem.get_code_asset_path("assets/simple_api/task_config_templates.json")
    )
    task_config = get_task_config(
        task_config_templates=task_config_templates,
        task_name=task_name,
        task_data_dir=task_data_dir,
    )
    os.makedirs(os.path.split(task_config_path)[0], exist_ok=True)
    py_io.write_json(task_config, task_config_path)
def read_examples(self, path, set_type):
    input_data = read_json(path, encoding="utf-8")["data"]
    is_training = set_type == PHASE.TRAIN
    examples = []
    data = take_one(input_data)
    for paragraph in maybe_tqdm(data["paragraphs"]):
        for qa in paragraph["qas"]:
            qas_id = qa["id"]
            # Because answers can also come from questions, we're going to abuse notation
            # slightly and put the entire background+situation+question into the "context"
            # and leave nothing for the "question"
            question_text = " "
            if self.include_background:
                context_segments = [
                    paragraph["background"],
                    paragraph["situation"],
                    qa["question"],
                ]
            else:
                context_segments = [paragraph["situation"], qa["question"]]
            full_context = " ".join(segment.strip() for segment in context_segments)
            if is_training:
                answer = qa["answers"][0]
                start_position_character = full_context.find(answer["text"])
                answer_text = answer["text"]
                answers = []
            else:
                start_position_character = None
                answer_text = None
                answers = [
                    {"text": answer["text"], "answer_start": full_context.find(answer["text"])}
                    for answer in qa["answers"]
                ]
            example = Example(
                qas_id=qas_id,
                question_text=question_text,
                context_text=full_context,
                answer_text=answer_text,
                start_position_character=start_position_character,
                title="",
                is_impossible=False,
                answers=answers,
                background_text=paragraph["background"],
                situation_text=paragraph["situation"],
            )
            examples.append(example)
    return examples
def test_export_model(tmp_path, model_type, model_class, hf_pretrained_model_name_or_path):
    export_model(
        hf_pretrained_model_name_or_path=hf_pretrained_model_name_or_path,
        output_base_path=tmp_path,
    )
    read_config = py_io.read_json(os.path.join(tmp_path, "config.json"))
    assert read_config["model_type"] == model_type
    assert read_config["model_path"] == os.path.join(tmp_path, "model", f"{model_type}.p")
    assert read_config["model_config_path"] == os.path.join(
        tmp_path, "model", f"{model_type}.json"
    )
def run_from_parser_json_prepend(cls, parser, cl_args):
    parser.add_argument("--ZZsrc", type=str, action="append")
    parser.add_argument("--ZZoverrides", type=str, nargs="+")
    if cl_args is None:
        cl_args = sys.argv[1:]
    pre_args, _ = parser.parse_known_args(cl_args)
    if pre_args.ZZsrc is not None:
        # Import configs from ZZsrc JSONs
        imported_dict_ls = [read_json(path) for path in pre_args.ZZsrc]
        combined_imported_dict = combine_dicts(imported_dict_ls, strict=True)
        # Record which args are going to be overridden
        if pre_args.ZZoverrides is not None:
            raw_overrides = pre_args.ZZoverrides
            overrides = [f"--{k}" for k in raw_overrides]
        else:
            raw_overrides = overrides = []
        attr_dict = cls.get_attr_dict()
        added_args = []
        for k, v in combined_imported_dict.items():
            formatted_k = f"--{k}"
            # Imported args that are not marked as overridable must not also be
            # specified explicitly on the command line
            if formatted_k in cl_args and formatted_k not in overrides:
                raise RuntimeError(f"Attempting to override {formatted_k}")
            # Special handling for store_true args
            if cls._is_store_true_arg(attr_dict[k]):
                if v and k not in raw_overrides:
                    added_args.append(formatted_k)
            else:
                added_args.append(formatted_k)
                added_args.append(str(v))
        submitted_args = added_args + cl_args
    else:
        assert pre_args.ZZoverrides is None
        submitted_args = cl_args
    update_parser(
        parser=parser,
        class_with_attributes=cls,
    )
    result, _ = read_parser(
        parser=parser,
        class_with_attributes=cls,
        skip_non_class_attributes=["ZZsrc", "ZZoverrides"],
        args=submitted_args,
    )
    assert isinstance(result, cls)
    return result
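# Behavior sketch for the ZZsrc/ZZoverrides protocol (hypothetical attribute names
# and paths). Given a JSON at /path/to/base_args.json containing
# {"learning_rate": 1e-05, "fp16": true}, the invocation below prepends
# ["--learning_rate", "1e-05", "--fp16"] to the command line; because
# "learning_rate" is listed in --ZZoverrides, the explicit --learning_rate 3e-5
# (appended after the imported args, so argparse keeps it) wins instead of
# raising "Attempting to override --learning_rate":
#
#   script.py --ZZsrc /path/to/base_args.json \
#             --ZZoverrides learning_rate \
#             --learning_rate 3e-5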
def test_export_model(tmp_path, model_type, model_class, tokenizer_class, hf_model_name):
    export_model(
        model_type=model_type,
        output_base_path=tmp_path,
        model_class=model_class,
        tokenizer_class=tokenizer_class,
        hf_model_name=hf_model_name,
    )
    read_config = py_io.read_json(os.path.join(tmp_path, "config.json"))
    assert read_config["model_type"] == model_type
    assert read_config["model_path"] == os.path.join(tmp_path, "model", f"{model_type}.p")
    assert read_config["model_config_path"] == os.path.join(
        tmp_path, "model", f"{model_type}.json"
    )
    assert read_config["model_tokenizer_path"] == os.path.join(tmp_path, "tokenizer")
def create_task_from_config_path(config_path: str, verbose: bool = False):
    """Creates task instance from task config filepath.

    Args:
        config_path (str): config filepath.
        verbose (bool): True if task config should be printed during task creation.

    Returns:
        Task instance.
    """
    return create_task_from_config(
        read_json(config_path),
        base_path=os.path.split(config_path)[0],
        verbose=verbose,
    )
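def _example_create_task():
    """Usage sketch for create_task_from_config_path (hypothetical path and contents).

    A task config JSON follows the shape written by the download helpers below, e.g.:
        {
            "task": "tydiqa",
            "paths": {"train": "/path/to/train.json", "val": "/path/to/dev.json"},
            "kwargs": {"language": "en"},
            "name": "tydiqa_en",
        }
    """
    return create_task_from_config_path(
        config_path="/path/to/configs/tydiqa_en_config.json",
        verbose=True,
    )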
def test_simple_runscript(tmpdir, task_name, model_type):
    RUN_NAME = f"{test_simple_runscript.__name__}_{task_name}_{model_type}"
    data_dir = str(tmpdir.mkdir("data"))
    exp_dir = str(tmpdir.mkdir("exp"))

    downloader.download_data([task_name], data_dir)

    args = run.RunConfiguration(
        run_name=RUN_NAME,
        exp_dir=exp_dir,
        data_dir=data_dir,
        model_type=model_type,
        tasks=task_name,
        train_examples_cap=16,
        train_batch_size=16,
        no_cuda=True,
    )
    run.run_simple(args)

    val_metrics = py_io.read_json(os.path.join(exp_dir, "runs", RUN_NAME, "val_metrics.json"))
    assert val_metrics["aggregated"] > 0
def run_simple(args: RunConfiguration, with_continue: bool = False):
    hf_config = AutoConfig.from_pretrained(args.hf_pretrained_model_name_or_path)
    model_cache_path = replace_none(
        args.model_cache_path, default=os.path.join(args.exp_dir, "models")
    )

    with distributed.only_first_process(local_rank=args.local_rank):
        # === Step 1: Write task configs based on templates === #
        full_task_name_list = sorted(
            list(set(args.train_tasks + args.val_tasks + args.test_tasks))
        )
        task_config_path_dict = {}
        if args.create_config:
            task_config_path_dict = create_and_write_task_configs(
                task_name_list=full_task_name_list,
                data_dir=args.data_dir,
                task_config_base_path=os.path.join(args.data_dir, "configs"),
            )
        else:
            for task_name in full_task_name_list:
                task_config_path_dict[task_name] = os.path.join(
                    args.data_dir, "configs", f"{task_name}_config.json"
                )

        # === Step 2: Download models === #
        # if not os.path.exists(os.path.join(model_cache_path, hf_config.model_type)):
        #     print("Downloading model")
        #     export_model.export_model(
        #         hf_pretrained_model_name_or_path=args.hf_pretrained_model_name_or_path,
        #         output_base_path=os.path.join(model_cache_path, hf_config.model_type),
        #     )

        # === Step 3: Tokenize and cache === #
        phase_task_dict = {
            "train": args.train_tasks,
            "val": args.val_tasks,
            "test": args.test_tasks,
        }
        for task_name in full_task_name_list:
            phases_to_do = []
            for phase, phase_task_list in phase_task_dict.items():
                if task_name in phase_task_list and not os.path.exists(
                    os.path.join(args.exp_dir, "cache", hf_config.model_type, task_name, phase)
                ):
                    config = read_json(task_config_path_dict[task_name])
                    if phase in config["paths"]:
                        phases_to_do.append(phase)
                    else:
                        phase_task_list.remove(task_name)
            if not phases_to_do:
                continue
            print(f"Tokenizing Task '{task_name}' for phases '{','.join(phases_to_do)}'")
            tokenize_and_cache.main(
                tokenize_and_cache.RunConfiguration(
                    task_config_path=task_config_path_dict[task_name],
                    hf_pretrained_model_name_or_path=args.hf_pretrained_model_name_or_path,
                    output_dir=os.path.join(
                        args.exp_dir, "cache", hf_config.model_type, task_name
                    ),
                    phases=phases_to_do,
                    # TODO: Need a strategy for task-specific max_seq_length issues (issue #1176)
                    max_seq_length=args.max_seq_length,
                    smart_truncate=True,
                    do_iter=True,
                )
            )

    # === Step 4: Generate jiant_task_container_config === #
    # We'll do this with a configurator. Creating a jiant_task_config has a surprising
    # number of moving parts.
    jiant_task_container_config = configurator.SimpleAPIMultiTaskConfigurator(
        task_config_base_path=os.path.join(args.data_dir, "configs"),
        task_cache_base_path=os.path.join(args.exp_dir, "cache", hf_config.model_type),
        train_task_name_list=args.train_tasks,
        val_task_name_list=args.val_tasks,
        test_task_name_list=args.test_tasks,
        train_batch_size=args.train_batch_size,
        eval_batch_multiplier=2,
        epochs=args.num_train_epochs,
        num_gpus=torch.cuda.device_count(),
        train_examples_cap=args.train_examples_cap,
    ).create_config()
    os.makedirs(os.path.join(args.exp_dir, "run_configs"), exist_ok=True)
    jiant_task_container_config_path = os.path.join(
        args.exp_dir, "run_configs", f"{args.run_name}_config.json"
    )
    py_io.write_json(jiant_task_container_config, path=jiant_task_container_config_path)

    # === Step 5: Train/Eval! === #
    if args.model_weights_path:
        model_load_mode = "partial"
        model_weights_path = args.model_weights_path
    else:
        # From Transformers
        if any(task_name.startswith("mlm_") for task_name in full_task_name_list):
            model_load_mode = "from_transformers_with_mlm"
        else:
            model_load_mode = "from_transformers"
        model_weights_path = os.path.join(
            model_cache_path, hf_config.model_type, "model", "model.p"
        )
    run_output_dir = os.path.join(args.exp_dir, "runs", args.run_name)

    if (
        args.save_checkpoint_every_steps
        and os.path.exists(os.path.join(run_output_dir, "checkpoint.p"))
        and with_continue
    ):
        print("Resuming")
        checkpoint = torch.load(os.path.join(run_output_dir, "checkpoint.p"))
        run_args = runscript.RunConfiguration.from_dict(checkpoint["metadata"]["args"])
    else:
        print("Running from start")
        run_args = runscript.RunConfiguration(
            # === Required parameters === #
            jiant_task_container_config_path=jiant_task_container_config_path,
            output_dir=run_output_dir,
            # === Model parameters === #
            hf_pretrained_model_name_or_path=args.hf_pretrained_model_name_or_path,
            model_path=model_weights_path,
            model_config_path=os.path.join(
                model_cache_path, hf_config.model_type, "model", "config.json",
            ),
            model_load_mode=model_load_mode,
            # === Running Setup === #
            do_train=bool(args.train_tasks),
            do_val=bool(args.val_tasks),
            do_save=args.do_save,
            do_save_best=args.do_save_best,
            do_save_last=args.do_save_last,
            write_val_preds=args.write_val_preds,
            write_test_preds=args.write_test_preds,
            eval_every_steps=args.eval_every_steps,
            save_every_steps=args.save_every_steps,
            save_checkpoint_every_steps=args.save_checkpoint_every_steps,
            no_improvements_for_n_evals=args.no_improvements_for_n_evals,
            keep_checkpoint_when_done=args.keep_checkpoint_when_done,
            force_overwrite=args.force_overwrite,
            seed=args.seed,
            # === Training Learning Parameters === #
            learning_rate=args.learning_rate,
            adam_epsilon=args.adam_epsilon,
            max_grad_norm=args.max_grad_norm,
            optimizer_type=args.optimizer_type,
            # === Specialized config === #
            no_cuda=args.no_cuda,
            fp16=args.fp16,
            fp16_opt_level=args.fp16_opt_level,
            local_rank=args.local_rank,
            server_ip=args.server_ip,
            server_port=args.server_port,
        )
        checkpoint = None
    runscript.run_loop(args=run_args, checkpoint=checkpoint)
    py_io.write_file(args.to_json(), os.path.join(run_output_dir, "simple_run_config.json"))
def simple_multi_task_config(
    task_meta_config_dict,
    task_cache_dict,
    task_name_list=None,
    epochs=None,
    max_steps=None,
    num_gpus=1,
    train_examples_cap=None,
    warmup_steps_proportion=0.1,
):
    if isinstance(task_meta_config_dict, str):
        task_meta_config_dict = py_io.read_json(os.path.expandvars(task_meta_config_dict))
    if isinstance(task_cache_dict, str):
        task_cache_dict = py_io.read_json(os.path.expandvars(task_cache_dict))
    if task_name_list is None:
        task_name_list = sorted(list(task_meta_config_dict))
    assert (epochs is None) != (max_steps is None)

    # Proportional sampling: count (capped) examples per task
    num_examples_dict = {}
    capped_num_examples_dict = {}
    max_steps_not_given = max_steps is None
    if max_steps_not_given:
        assert isinstance(epochs, (int, float))
        max_steps = 0
    for task_name in task_name_list:
        effective_batch_size = (
            task_meta_config_dict[task_name]["train_batch_size"]
            * task_meta_config_dict[task_name]["gradient_accumulation_steps"]
            * num_gpus
        )
        num_examples = get_num_examples_from_cache(
            cache_path=os.path.expandvars(task_cache_dict[task_name]["train"]),
        )
        capped_num_examples = cap_examples(num_examples=num_examples, cap=train_examples_cap)
        num_examples_dict[task_name] = num_examples
        capped_num_examples_dict[task_name] = capped_num_examples
        if max_steps_not_given:
            max_steps += num_examples * epochs // effective_batch_size

    if train_examples_cap is None:
        sampler_config = {
            "sampler_type": "ProportionalMultiTaskSampler",
        }
    else:
        sampler_config = {
            "sampler_type": "SpecifiedProbMultiTaskSampler",
            "task_to_unweighted_probs": capped_num_examples_dict,
        }
    config_dict = {
        "task_config_path_dict": {
            task_name: os.path.expandvars(task_meta_config_dict[task_name]["config_path"])
            for task_name in task_name_list
        },
        "task_cache_config_dict": {
            task_name: {
                "train": os.path.expandvars(task_cache_dict[task_name]["train"]),
                "val": os.path.expandvars(task_cache_dict[task_name]["val"]),
                "val_labels": os.path.expandvars(task_cache_dict[task_name]["val_labels"]),
            }
            for task_name in task_name_list
        },
        "sampler_config": sampler_config,
        "global_train_config": {
            "max_steps": max_steps,
            "warmup_steps": int(max_steps * warmup_steps_proportion),
        },
        "task_specific_configs_dict": {
            task_name: {
                "train_batch_size": task_meta_config_dict[task_name]["train_batch_size"],
                "eval_batch_size": task_meta_config_dict[task_name]["eval_batch_size"],
                "gradient_accumulation_steps": task_meta_config_dict[task_name][
                    "gradient_accumulation_steps"
                ],
                "eval_subset_num": task_meta_config_dict[task_name]["eval_subset_num"],
            }
            for task_name in task_name_list
        },
        "taskmodels_config": {
            "task_to_taskmodel_map": {
                task_name: task_meta_config_dict[task_name]["task_to_taskmodel_map"]
                for task_name in task_name_list
            },
            "taskmodel_config_map": {task_name: None for task_name in task_name_list},
        },
        "task_run_config": {
            "train_task_list": task_name_list,
            "train_val_task_list": task_name_list,
            "val_task_list": task_name_list,
            "test_task_list": task_name_list,
        },
        "metric_aggregator_config": {"metric_aggregator_type": "EqualMetricAggregator"},
    }
    return config_dict
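def _example_simple_multi_task_config():
    """Usage sketch for simple_multi_task_config (hypothetical task name and paths).

    Each task_meta_config_dict entry must carry the keys read above: "config_path",
    "train_batch_size", "gradient_accumulation_steps", "eval_batch_size",
    "eval_subset_num", and "task_to_taskmodel_map"; each task_cache_dict entry must
    carry "train", "val", and "val_labels" cache paths.
    """
    task_meta_config_dict = {
        "mnli": {
            "config_path": "${DATA_DIR}/configs/mnli_config.json",
            "train_batch_size": 16,
            "gradient_accumulation_steps": 2,
            "eval_batch_size": 32,
            "eval_subset_num": 500,
            "task_to_taskmodel_map": "mnli",
        },
    }
    task_cache_dict = {
        "mnli": {
            "train": "${CACHE_DIR}/mnli/train",
            "val": "${CACHE_DIR}/mnli/val",
            "val_labels": "${CACHE_DIR}/mnli/val_labels",
        },
    }
    return simple_multi_task_config(
        task_meta_config_dict=task_meta_config_dict,
        task_cache_dict=task_cache_dict,
        epochs=3,
        num_gpus=1,
    )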
def create_config(self):
    # === Gather task names === #
    # Get the full list of tasks across all phases
    task_name_list_dict = {
        "train": self.parse_task_name_list(self.train_task_name_list),
        "val": self.parse_task_name_list(self.val_task_name_list),
        "test": self.parse_task_name_list(self.test_task_name_list),
    }
    if self.train_val_task_name_list is None:
        task_name_list_dict["train_val"] = task_name_list_dict["train"]
    else:
        task_name_list_dict["train_val"] = self.parse_task_name_list(
            self.train_val_task_name_list
        )
    full_task_name_list = py_datastructures.get_unique_list_in_order(
        task_name_list_dict.values()
    )

    # === Gather task configs === #
    # Build task_config_path_dict, either via:
    # 1. task_config_base_path: all task configs are contained in a given folder
    # 2. task_config_path_dict: an explicitly provided dictionary of task config paths,
    #    either as a dict or as a path to a JSON
    if self.task_config_base_path is not None:
        assert self.task_config_path_dict is None
        task_config_path_dict = {
            task_name: os.path.join(self.task_config_base_path, f"{task_name}_config.json")
            for task_name in full_task_name_list
        }
    else:
        if isinstance(self.task_config_path_dict, str):
            task_config_path_dict = py_io.read_json(
                os.path.expandvars(self.task_config_path_dict)
            )
        else:
            task_config_path_dict = self.task_config_path_dict

    # === Gather cache === #
    # Build task_cache_config_dict, either via:
    # 1. task_cache_base_path: all caches are contained in a given folder
    # 2. task_cache_config_dict: an explicitly provided dictionary of cache paths,
    #    either as a dict or as a path to a JSON
    if self.task_cache_base_path is not None:
        assert self.task_cache_config_dict is None
        task_cache_config_dict = {}
        for task_name in full_task_name_list:
            task_cache_config_dict[task_name] = {}
            if task_name in task_name_list_dict["train"]:
                task_cache_config_dict[task_name]["train"] = os.path.join(
                    self.task_cache_base_path, task_name, "train",
                )
            if (
                task_name in task_name_list_dict["train_val"]
                or task_name in task_name_list_dict["val"]
            ):
                task_cache_config_dict[task_name]["val"] = os.path.join(
                    self.task_cache_base_path, task_name, "val",
                )
                task_cache_config_dict[task_name]["val_labels"] = os.path.join(
                    self.task_cache_base_path, task_name, "val_labels",
                )
            if task_name in task_name_list_dict["test"]:
                task_cache_config_dict[task_name]["test"] = os.path.join(
                    self.task_cache_base_path, task_name, "test",
                )
    elif isinstance(self.task_cache_config_dict, str):
        assert self.task_cache_base_path is None
        task_cache_config_dict = py_io.read_json(self.task_cache_config_dict)
    elif isinstance(self.task_cache_config_dict, dict):
        task_cache_config_dict = self.task_cache_config_dict
    else:
        raise RuntimeError("Need 'task_cache_base_path' or 'task_cache_config_dict'")

    # === Compute training steps === #
    # Computing the number of training steps across multiple tasks is slightly
    # trickier than expected (unless max_steps is explicitly provided).
    # We need to get the number of examples for each task, divide by the
    # effective batch size (batch size per GPU * grad accum steps * number of GPUs),
    # AND consider the common use-case where we cap the number of examples per task.
    assert (self.epochs is None) != (
        self.max_steps is None
    ), "Specify only 'epochs' or 'max_steps'"
    num_examples_dict = {}
    capped_num_examples_dict = {}
    max_steps_not_given = self.max_steps is None
    if max_steps_not_given:
        assert isinstance(self.epochs, (int, float))
        max_steps = 0
    else:
        max_steps = self.max_steps
    for task_name in task_name_list_dict["train"]:
        if self.num_gpus:
            # We multiply by num_gpus because 1 step is done across (potentially)
            # multiple GPUs
            effective_batch_size = (
                self.train_batch_size * self.gradient_accumulation_steps * self.num_gpus
            )
        else:
            effective_batch_size = self.train_batch_size * self.gradient_accumulation_steps
        num_examples = get_num_examples_from_cache(
            cache_path=os.path.expandvars(task_cache_config_dict[task_name]["train"]),
        )
        capped_num_examples = cap_examples(
            num_examples=num_examples, cap=self.train_examples_cap
        )
        num_examples_dict[task_name] = num_examples
        capped_num_examples_dict[task_name] = capped_num_examples
        if max_steps_not_given:
            max_steps += self.epochs * math.ceil(capped_num_examples / effective_batch_size)

    # === Compute eval_batch_size === #
    # Eval batch size is often a multiple of train batch size,
    # so we provide 2 ways to specify it
    assert (self.eval_batch_size is None) != (
        self.eval_batch_multiplier is None
    ), "Specify only 'eval_batch_size' or 'eval_batch_multiplier'"
    if self.eval_batch_multiplier is not None:
        eval_batch_size = self.train_batch_size * self.eval_batch_multiplier
    else:
        eval_batch_size = self.eval_batch_size

    # === Configure Sampler === #
    # We sample proportionally by default, unless our training examples are capped per task
    if self.train_examples_cap is None:
        sampler_config = {
            "sampler_type": "ProportionalMultiTaskSampler",
        }
    else:
        sampler_config = {
            "sampler_type": "SpecifiedProbMultiTaskSampler",
            "task_to_unweighted_probs": capped_num_examples_dict,
        }

    # === Build configuration === #
    # Finally, we build our big config dictionary. Congrats!
    config_dict = {
        "task_config_path_dict": task_config_path_dict,
        "task_cache_config_dict": task_cache_config_dict,
        "sampler_config": sampler_config,
        "global_train_config": {
            "max_steps": int(max_steps),
            "warmup_steps": int(max_steps * self.warmup_steps_proportion),
        },
        "task_specific_configs_dict": {
            task_name: {
                "train_batch_size": self.train_batch_size,
                "eval_batch_size": eval_batch_size,
                "gradient_accumulation_steps": self.gradient_accumulation_steps,
                "eval_subset_num": self.eval_subset_num,
            }
            for task_name in full_task_name_list
        },
        "taskmodels_config": {
            "task_to_taskmodel_map": {
                task_name: task_name for task_name in full_task_name_list
            },
            "taskmodel_config_map": {task_name: None for task_name in full_task_name_list},
        },
        "task_run_config": {
            "train_task_list": task_name_list_dict["train"],
            "train_val_task_list": task_name_list_dict["train_val"],
            "val_task_list": task_name_list_dict["val"],
            "test_task_list": task_name_list_dict["test"],
        },
        "metric_aggregator_config": {"metric_aggregator_type": "EqualMetricAggregator"},
    }
    return config_dict
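def _example_simple_api_configurator():
    """Usage sketch for SimpleAPIMultiTaskConfigurator.create_config (hypothetical paths).

    Mirrors the invocation in run_simple above: point the configurator at the task
    config and cache folders, pick the per-phase task lists, and let it derive
    max_steps from epochs.
    """
    jiant_task_container_config = configurator.SimpleAPIMultiTaskConfigurator(
        task_config_base_path="/path/to/data/configs",
        task_cache_base_path="/path/to/exp/cache/roberta",
        train_task_name_list=["mnli", "rte"],
        val_task_name_list=["mnli", "rte"],
        test_task_name_list=[],
        train_batch_size=16,
        eval_batch_multiplier=2,
        epochs=3,
        num_gpus=1,
        train_examples_cap=None,
    ).create_config()
    py_io.write_json(
        jiant_task_container_config, path="/path/to/run_configs/my_run_config.json"
    )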
def write_configs_from_full(full_config_path):
    write_configs(
        config_dict=py_io.read_json(full_config_path),
        base_path=os.path.split(full_config_path)[0],
    )
def setup_runner(
    args: RunConfiguration,
    jiant_task_container: container_setup.JiantTaskContainer,
    quick_init_out,
    verbose: bool = True,
) -> jiant_runner.JiantRunner:
    """Setup jiant model, optimizer, and runner, and return runner.

    Args:
        args (RunConfiguration): configuration carrying command line args specifying run params.
        jiant_task_container (container_setup.JiantTaskContainer): task and sampler configs.
        quick_init_out (QuickInitContainer): device (GPU/CPU) and logging configuration.
        verbose: If True, enables printing configuration info (to standard out).

    Returns:
        jiant_runner.JiantRunner
    """
    # TODO: document why the distributed.only_first_process() context manager is being used here.
    jiant_model = jiant_model_setup.setup_jiant_model(
        model_type=args.model_type,
        model_config_path=args.model_config_path,
        tokenizer_path=args.model_tokenizer_path,
        task_dict=jiant_task_container.task_dict,
        taskmodels_config=jiant_task_container.taskmodels_config,
    )
    weights_dict = torch.load(args.model_path)
    jiant_model_setup.load_encoder_from_transformers_weights(
        encoder=jiant_model.encoder, weights_dict=weights_dict,
    )
    if args.adapter_config_path:
        adapter_config = adapters_modeling.AdapterConfig.from_dict(
            py_io.read_json(args.adapter_config_path),
        )
    else:
        adapter_config = adapters_modeling.AdapterConfig()
    adapters_modeling.add_shared_adapters_to_jiant_model(
        jiant_model=jiant_model, adapter_config=adapter_config,
    )
    if args.adapters_load_mode and args.adapters_load_path:
        adapters_modeling.delegate_load_for_shared_adapters(
            jiant_model=jiant_model,
            state_dict=torch.load(args.adapters_load_path),
            load_mode=args.adapters_load_mode,
        )
    jiant_model.to(quick_init_out.device)
    (
        optimized_named_parameters,
        _,
    ) = adapters_modeling.get_optimized_named_parameters_for_jiant_model_with_adapters(
        jiant_model=jiant_model,
    )
    optimizer_scheduler = model_setup.create_optimizer_from_params(
        named_parameters=optimized_named_parameters,
        learning_rate=args.learning_rate,
        t_total=jiant_task_container.global_train_config.max_steps,
        warmup_steps=jiant_task_container.global_train_config.warmup_steps,
        warmup_proportion=None,
        verbose=verbose,
    )
    jiant_model, optimizer = model_setup.raw_special_model_setup(
        model=jiant_model,
        optimizer=optimizer_scheduler.optimizer,
        fp16=args.fp16,
        fp16_opt_level=args.fp16_opt_level,
        n_gpu=quick_init_out.n_gpu,
        local_rank=args.local_rank,
    )
    optimizer_scheduler.optimizer = optimizer
    rparams = jiant_runner.RunnerParameters(
        local_rank=args.local_rank,
        n_gpu=quick_init_out.n_gpu,
        fp16=args.fp16,
        max_grad_norm=args.max_grad_norm,
    )
    runner = jiant_runner.JiantRunner(
        jiant_task_container=jiant_task_container,
        jiant_model=jiant_model,
        optimizer_scheduler=optimizer_scheduler,
        device=quick_init_out.device,
        rparams=rparams,
        log_writer=quick_init_out.log_writer,
    )
    return runner
def download_tydiqa_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    tydiqa_temp_path = py_io.create_dir(task_data_base_path, "tydiqa_temp")
    full_train_path = os.path.join(tydiqa_temp_path, "tydiqa-goldp-v1.1-train.json")
    download_utils.download_file(
        "https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-train.json",
        full_train_path,
    )
    download_utils.download_and_untar(
        "https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-dev.tgz",
        tydiqa_temp_path,
    )
    languages_dict = {
        "arabic": "ar",
        "bengali": "bn",
        "english": "en",
        "finnish": "fi",
        "indonesian": "id",
        "korean": "ko",
        "russian": "ru",
        "swahili": "sw",
        "telugu": "te",
    }

    # Split train data by language
    data = py_io.read_json(full_train_path)
    lang2data = {lang: [] for lang in languages_dict.values()}
    for doc in data["data"]:
        for par in doc["paragraphs"]:
            context = par["context"]
            for qa in par["qas"]:
                question = qa["question"]
                question_id = qa["id"]
                example_lang = languages_dict[question_id.split("-")[0]]
                q_id = question_id.split("-")[-1]
                for answer in qa["answers"]:
                    a_start, a_text = answer["answer_start"], answer["text"]
                    a_end = a_start + len(a_text)
                    assert context[a_start:a_end] == a_text
                lang2data[example_lang].append(
                    {
                        "paragraphs": [
                            {
                                "context": context,
                                "qas": [
                                    {"answers": qa["answers"], "question": question, "id": q_id}
                                ],
                            }
                        ]
                    }
                )

    for full_lang, lang in languages_dict.items():
        task_name = f"tydiqa_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        train_path = os.path.join(task_data_path, f"tydiqa.{lang}.train.json")
        # Write only this language's split (not the full multilingual train set)
        py_io.write_json(
            data={"data": lang2data[lang]},
            path=train_path,
            skip_if_exists=True,
        )
        val_path = os.path.join(task_data_path, f"tydiqa.{lang}.dev.json")
        os.rename(
            src=os.path.join(
                tydiqa_temp_path, "tydiqa-goldp-v1.1-dev", f"tydiqa-goldp-dev-{full_lang}.json"
            ),
            dst=val_path,
        )
        py_io.write_json(
            data={
                "task": "tydiqa",
                "paths": {"train": train_path, "val": val_path},
                "kwargs": {"language": lang},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(tydiqa_temp_path)
def get_tags_to_id(self):
    tags_to_id = read_json(self.path_dict["tags_to_id"])
    tags_to_id = {k: int(v) for k, v in tags_to_id.items()}
    return tags_to_id
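# Shape sketch for the tags_to_id JSON read above (hypothetical tag set): the file
# maps tag strings to ids, and values may be stored as strings, which is why
# get_tags_to_id casts them with int(). For example,
#   {"ADJ": "0", "NOUN": "1", "VERB": "2"}
# becomes
#   {"ADJ": 0, "NOUN": 1, "VERB": 2}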
def single_task_config(
    task_config_path,
    train_batch_size=None,
    task_cache_base_path=None,
    task_cache_train_path=None,
    task_cache_val_path=None,
    task_cache_val_labels_path=None,
    epochs=None,
    max_steps=None,
    eval_batch_multiplier=2,
    eval_batch_size=None,
    gradient_accumulation_steps=1,
    eval_subset_num=500,
    warmup_steps_proportion=0.1,
    phases=("train", "val"),
):
    task_config = py_io.read_json(os.path.expandvars(task_config_path))
    task_name = task_config["name"]
    do_train = "train" in phases
    do_val = "val" in phases

    cache_path_dict = {}
    if do_train:
        if task_cache_train_path is None:
            task_cache_train_path = os.path.join(task_cache_base_path, "train")
        cache_path_dict["train"] = os.path.expandvars(task_cache_train_path)
    if do_val:
        if task_cache_val_path is None:
            task_cache_val_path = os.path.join(task_cache_base_path, "val")
        if task_cache_val_labels_path is None:
            task_cache_val_labels_path = os.path.join(task_cache_base_path, "val_labels")
        cache_path_dict["val"] = os.path.expandvars(task_cache_val_path)
        cache_path_dict["val_labels"] = os.path.expandvars(task_cache_val_labels_path)

    if do_train:
        assert (epochs is None) != (max_steps is None)
        assert train_batch_size is not None
        if epochs is not None:
            # Derive max_steps from epochs; an explicitly provided max_steps is kept as-is
            effective_batch_size = train_batch_size * gradient_accumulation_steps
            num_training_examples = get_num_examples_from_cache(
                cache_path=os.path.expandvars(task_cache_train_path),
            )
            max_steps = num_training_examples * epochs // effective_batch_size
    else:
        max_steps = 0
        train_batch_size = 0
    if do_val:
        if eval_batch_size is None:
            assert train_batch_size is not None
            eval_batch_size = train_batch_size * eval_batch_multiplier

    config_dict = {
        "task_config_path_dict": {
            task_name: os.path.expandvars(task_config_path),
        },
        "task_cache_config_dict": {
            task_name: cache_path_dict,
        },
        "sampler_config": {
            "sampler_type": "UniformMultiTaskSampler",
        },
        "global_train_config": {
            "max_steps": max_steps,
            "warmup_steps": int(max_steps * warmup_steps_proportion),
        },
        "task_specific_configs_dict": {
            task_name: {
                "train_batch_size": train_batch_size,
                "eval_batch_size": eval_batch_size,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "eval_subset_num": eval_subset_num,
            },
        },
        "taskmodels_config": {
            "task_to_taskmodel_map": {
                task_name: task_name,
            },
            "taskmodel_config_map": {
                task_name: None,
            },
        },
        "task_run_config": {
            "train_task_list": [task_name] if do_train else [],
            "train_val_task_list": [task_name] if do_train else [],
            "val_task_list": [task_name] if do_val else [],
            "test_task_list": [],
        },
        "metric_aggregator_config": {
            "metric_aggregator_type": "EqualMetricAggregator",
        },
    }
    return config_dict
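def _example_single_task_config():
    """Usage sketch for single_task_config (hypothetical paths).

    With task_cache_base_path set, the train/val/val_labels cache paths are derived
    automatically; max_steps is computed from epochs and the cached example count.
    """
    config_dict = single_task_config(
        task_config_path="${DATA_DIR}/configs/rte_config.json",
        task_cache_base_path="${CACHE_DIR}/rte",
        train_batch_size=16,
        epochs=3,
    )
    py_io.write_json(config_dict, path="/path/to/run_configs/rte_run_config.json")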