import argparse
import glob
import os
import shutil
from configparser import ConfigParser
from os.path import isdir, isfile, join
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from pkg_resources import Requirement, resource_filename, resource_isdir

from nerblackbox.modules.utils.env_variable import env_variable
# epoch2checkpoint and the DATASET / MODEL / SETTINGS / HPARAMS mappings used
# below are nerblackbox helpers; their exact module paths are omitted here.


def show_experiment_config(self) -> None:
    """
    print experiment config
    -----------------------
    :used attr: experiment_name: [str], e.g. 'exp0' or 'all'
    """
    from nerblackbox.modules.utils.env_variable import env_variable

    if self.experiment_name != "all":
        path_experiment_config = join(
            env_variable("DIR_EXPERIMENT_CONFIGS"), f"{self.experiment_name}.ini"
        )
        if isfile(path_experiment_config):
            with open(path_experiment_config, "r") as file:
                lines = file.read()
            print(f"> experiment_config = {path_experiment_config}")
            print()
            print(lines)
        else:
            print(f"> experiment_config = {path_experiment_config} does not exist.")
    else:
        experiment_configs = glob.glob(
            join(env_variable("DIR_EXPERIMENT_CONFIGS"), "*.ini")
        )
        # strip directory and '.ini' extension
        # (note: str.strip(".ini") would strip a character set, not the suffix)
        experiment_configs = [
            os.path.splitext(os.path.basename(elem))[0]
            for elem in experiment_configs
        ]
        experiment_configs = [
            elem for elem in experiment_configs if elem != "default"
        ]
        for experiment_config in experiment_configs:
            print(experiment_config)
def get_dataset_path(dataset: str, subset: str = "") -> str:
    """
    get dataset path for dataset

    Args:
        dataset: e.g. 'suc', 'swedish_ner_corpus'
        subset:  e.g. 'original_cased'

    Returns:
        dataset_path: path to dataset directory
    """
    if len(subset):
        return join(env_variable("DIR_DATASETS"), dataset, subset)
    else:
        return join(env_variable("DIR_DATASETS"), dataset)
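# Usage sketch for get_dataset_path(), assuming the environment variable
# DIR_DATASETS points to e.g. '/home/user/nerblackbox/data/datasets'
# (the concrete path is hypothetical):
#
#     get_dataset_path("swedish_ner_corpus")
#     # -> '/home/user/nerblackbox/data/datasets/swedish_ner_corpus'
#     get_dataset_path("suc", subset="original_cased")
#     # -> '/home/user/nerblackbox/data/datasets/suc/original_cased'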
def extract_best_single_run(self) -> None:
    if self.experiment is not None and self.single_runs is not None:
        _df_best_single_run = self.single_runs.iloc[0, :]
        assert (
            self.name is not None
        ), "ERROR! self.name is None, extract_best_single_run() failed."
        checkpoint = join(
            env_variable("DIR_CHECKPOINTS"),
            self.name,
            _df_best_single_run[("info", "run_name_nr")],
            epoch2checkpoint(_df_best_single_run[("metrics", "EPOCH_BEST")]),
        )
        fields_info = ["run_id", "run_name_nr"]
        self.best_single_run = dict(
            **{
                "exp_id": self._id,
                "exp_name": self.name,
                "checkpoint": checkpoint if isfile(checkpoint) else None,
            },
            **{
                field: _df_best_single_run[("info", field)]
                for field in fields_info
            },
            **{
                field: _df_best_single_run[("metrics", field)]
                for field in self.METRICS_PLUS.values()
            },
        )
    else:
        self.best_single_run = dict()
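# For orientation: extract_best_single_run() leaves self.best_single_run as a
# flat dict. The metric keys depend on METRICS_PLUS; all values below are made
# up for illustration, and the exact checkpoint filename depends on
# epoch2checkpoint():
#
#     {
#         "exp_id": "1",
#         "exp_name": "exp0",
#         "checkpoint": "<DIR_CHECKPOINTS>/exp0/runA-1/epoch=2.ckpt",  # or None
#         "run_id": "abc123",
#         "run_name_nr": "runA-1",
#         # ... one entry per metric in self.METRICS_PLUS.values()
#     }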
def _get_config(
    self, experiment_name: str
) -> Tuple[Dict[str, Dict[str, str]], List[str]]:
    """
    get ConfigParser instance and derive config dictionary from it

    Args:
        experiment_name: e.g. 'exp1', 'default'

    Returns:
        _config_dict: w/ keys = sections [str],
                      values = [dict] w/ key: value = params: values
        _run_names:   e.g. ["runA", "runB"]
    """
    config_path = join(
        env_variable("DIR_EXPERIMENT_CONFIGS"), f"{experiment_name}.ini"
    )
    if not os.path.isfile(config_path):
        raise Exception(f"config file at {config_path} does not exist")

    _config = ConfigParser()
    _config.read(config_path)
    _config_dict: Dict[str, Dict[str, Any]] = {
        s: dict(_config.items(s)) for s in _config.sections()
    }  # e.g. {'hparams': {'monitor': 'val_loss'}}
    _config_dict = {
        s: {k: self._convert(k, v) for k, v in subdict.items()}
        for s, subdict in _config_dict.items()
    }

    # combine sections 'dataset', 'model' & 'settings' to single section 'params'
    _config_dict["params"] = dict()
    for s in ["dataset", "model", "settings"]:
        if s in _config_dict.keys():
            _config_dict["params"].update(_config_dict[s])
            _config_dict.pop(s)

    # derive uncased from pretrained_model_name if not set explicitly;
    # 'uncased' must be checked first, since 'cased' is a substring of 'uncased'
    if (
        "uncased" not in _config_dict["params"].keys()
        and "pretrained_model_name" in _config_dict["params"]
    ):
        if "uncased" in _config_dict["params"]["pretrained_model_name"]:
            _config_dict["params"]["uncased"] = True
        elif "cased" in _config_dict["params"]["pretrained_model_name"]:
            _config_dict["params"]["uncased"] = False
        else:
            _config_dict["params"]["uncased"] = False
            print(
                "ATTENTION! could not derive uncased = True/False from pretrained_model_name."
                " => assume model is cased"
            )

    _run_names = [
        run_name for run_name in _config.sections() if run_name.startswith("run")
    ]

    return _config_dict, _run_names
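# Example: given an exp1.ini along these lines (model name and parameter
# values are illustrative)
#
#     [dataset]
#     dataset_name = swedish_ner_corpus
#     [model]
#     pretrained_model_name = af-ai-center/bert-base-swedish-uncased
#     [hparams]
#     max_epochs = 3
#     [runA]
#     lr_max = 2e-5
#
# _get_config("exp1") would return (values shown after _convert()):
#
#     _config_dict = {
#         "params": {"dataset_name": "swedish_ner_corpus",
#                    "pretrained_model_name": "af-ai-center/bert-base-swedish-uncased",
#                    "uncased": True},
#         "hparams": {"max_epochs": 3},
#         "runA": {"lr_max": 2e-05},
#     }
#     _run_names = ["runA"]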
def assert_that_experiment_hasnt_been_run_before(experiment_name: str) -> None:
    """
    Args:
        experiment_name: e.g. 'my_experiment'
    """
    experiment_directory = join(env_variable("DIR_CHECKPOINTS"), experiment_name)
    if isdir(experiment_directory):
        raise Exception(
            f"ERROR! experiment = {experiment_name} has been run before "
            f"({experiment_directory} exists)"
        )
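# Usage sketch (hypothetical experiment name):
#
#     assert_that_experiment_hasnt_been_run_before("my_experiment")
#     # -> raises Exception if <DIR_CHECKPOINTS>/my_experiment already exists,
#     #    otherwise returns None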
def get_available_datasets() -> List[str]:
    """
    get datasets that are available in DIR_DATASETS directory

    Returns:
        available datasets: e.g. ['suc', 'swedish_ner_corpus']
    """
    dir_datasets = env_variable("DIR_DATASETS")
    return [
        folder
        for folder in os.listdir(dir_datasets)
        if os.path.isdir(join(dir_datasets, folder))
    ]
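# Usage sketch: with <DIR_DATASETS> containing the subdirectories 'suc' and
# 'swedish_ner_corpus' (stray files are filtered out by the isdir check):
#
#     get_available_datasets()
#     # -> ['suc', 'swedish_ner_corpus']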
def _parse_args(_parser, _args):
    """
    :param _parser: [argparse ArgumentParser]
    :param _args:   [argparse arguments]
    :return: _params   [argparse.Namespace] attr: experiment_name, run_name, device, fp16, ..
    :return: _log_dirs [argparse.Namespace] attr: mlflow, tensorboard, checkpoints, log_file, mlflow_file
    """
    # parsing (note: _action_groups / _group_actions are private argparse attributes)
    _params = None
    for group in _parser._action_groups:
        group_dict = {
            a.dest: getattr(_args, a.dest, None) for a in group._group_actions
        }
        if group.title == "args_general":
            group_dict["device"] = torch.device(
                "cuda"
                if torch.cuda.is_available() and group_dict["device"] == "gpu"
                else "cpu"
            )
            # fp16 only makes sense on cuda
            group_dict["fp16"] = bool(
                group_dict["fp16"] and group_dict["device"].type == "cuda"
            )
            group_dict["from_config"] = bool(group_dict["from_config"])
            if len(group_dict["run_name"]) == 0:
                group_dict["run_name"] = None
            _params = argparse.Namespace(**group_dict)

    # log_dirs
    _log_dirs_dict = {
        "mlflow": env_variable("DIR_MLFLOW"),
        "tensorboard": env_variable("DIR_TENSORBOARD"),
        "checkpoints": env_variable("DIR_CHECKPOINTS"),
        "log_file": env_variable("LOG_FILE"),
        "mlflow_file": env_variable("MLFLOW_FILE"),
    }
    _log_dirs = argparse.Namespace(**_log_dirs_dict)

    return _params, _log_dirs
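# For orientation, the two namespaces returned by _parse_args() look roughly
# like this (attribute values are illustrative):
#
#     _params   = Namespace(experiment_name="exp1", run_name=None,
#                           device=device(type="cuda"), fp16=True,
#                           from_config=True, ...)
#     _log_dirs = Namespace(mlflow=..., tensorboard=..., checkpoints=...,
#                           log_file=..., mlflow_file=...)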
def clear_data(self) -> None:
    """
    :used attr: results [bool] if True, clear not only checkpoints
                but also mlflow, tensorboard and logs
    """
    data_dir = env_variable("DATA_DIR")
    results_dir = join(data_dir, "results")
    assert isdir(results_dir), f"directory {results_dir} does not exist."

    # checkpoints
    objects_to_remove = glob.glob(join(results_dir, "checkpoints", "*"))  # list

    # results (mlflow, tensorboard, ..)
    if self.results:
        results_files = (
            glob.glob(join(results_dir, "mlruns", "*"))
            + glob.glob(join(results_dir, "mlruns", ".*"))
            + glob.glob(join(results_dir, "tensorboard", "*"))
            + glob.glob(join(results_dir, "logs.log"))
            + glob.glob(join(results_dir, "*.npy"))
        )
        objects_to_remove.extend(results_files)

    if len(objects_to_remove) == 0:
        print(f"There is no data to remove in {results_dir}")
    else:
        for elem in objects_to_remove:
            print(elem)
        while True:
            answer = input("Do you want to remove the above files? (y/n) ")
            if answer == "y":
                for elem in objects_to_remove:
                    if isfile(elem):
                        os.remove(elem)
                    elif isdir(elem):
                        shutil.rmtree(elem, ignore_errors=False)
                    else:
                        raise ValueError(
                            f"object {elem} is neither a file nor a dir and cannot be removed"
                        )
                print("Files removed")
                break
            elif answer == "n":
                print("No files removed")
                break
            else:
                print("Please enter either y or n")
def _create_data_directory(self) -> None:
    if resource_isdir(
        Requirement.parse("nerblackbox"), "nerblackbox/modules/data"
    ):
        data_source = resource_filename(
            Requirement.parse("nerblackbox"), "nerblackbox/modules/data"
        )
        data_dir = env_variable("DATA_DIR")
        if self.verbose:
            print("data_source =", data_source)
            print("data_target =", data_dir)

        if os.path.isdir(data_dir):
            print(f"init: target {data_dir} already exists")
        else:
            shutil.copytree(data_source, data_dir)
            print(f"init: target {data_dir} created")
    else:
        print("init not executed successfully")
        exit(1)  # signal failure (exit(0) would indicate success)
def _write_config_file(self) -> None:
    """
    write config file based on self.hparams
    """
    # assert that config file does not exist
    config_path = join(
        env_variable("DIR_EXPERIMENT_CONFIGS"), f"{self.experiment_name}.ini"
    )
    assert (
        isfile(config_path) is False
    ), f"ERROR! experiment config file {config_path} already exists!"

    # write config file: helper functions (both close over the file handle f
    # opened below, so they must only be called inside the with block)
    def _write(_str: str):
        f.write(_str + "\n")

    def _write_key_value(_key: str):
        assert (
            self.hparams is not None
        ), "ERROR! self.hparams is None - _write_key_value() failed."
        if _key in self.hparams.keys():
            f.write(f"{_key} = {self.hparams[_key]}\n")

    # write config file
    with open(config_path, "w") as f:
        _write("[dataset]")
        for key in DATASET.keys():
            _write_key_value(key)
        _write("\n[model]")
        for key in MODEL.keys():
            _write_key_value(key)
        _write("\n[settings]")
        for key in SETTINGS.keys():
            _write_key_value(key)
        _write("\n[hparams]")
        for key in HPARAMS.keys():
            _write_key_value(key)
        _write("\n[runA]")
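# Example: with self.experiment_name = "exp0" and
# self.hparams = {"dataset_name": "swedish_ner_corpus",
#                 "pretrained_model_name": "bert-base-uncased",
#                 "max_epochs": 3}
# _write_config_file() would produce an exp0.ini along these lines (which key
# lands in which section depends on the DATASET/MODEL/SETTINGS/HPARAMS
# mappings; section headers are written even if a section stays empty):
#
#     [dataset]
#     dataset_name = swedish_ner_corpus
#
#     [model]
#     pretrained_model_name = bert-base-uncased
#
#     [settings]
#
#     [hparams]
#     max_epochs = 3
#
#     [runA]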
def _get_model_checkpoint_directory(_params):
    """
    :param _params: [argparse.Namespace] attr: experiment_name, run_name,
                    pretrained_model_name, dataset_name, ..
    :return: model_checkpoint_directory [str]
    """
    return join(env_variable("DIR_CHECKPOINTS"), _params.experiment_run_name_nr)
def __init__(
    self,
    flag: str,
    usage: str = "cli",
    dataset_name: Optional[str] = None,  # analyze_data & set_up_dataset
    dataset_subset_name: Optional[str] = None,  # set_up_dataset
    modify: bool = True,  # set_up_dataset
    val_fraction: float = 0.3,  # set_up_dataset
    verbose: bool = False,
    experiment_name: Optional[str] = None,
    hparams: Optional[Dict[str, Union[str, int, bool]]] = None,  # run_experiment
    from_preset: Optional[str] = None,  # run_experiment
    from_config: bool = False,  # run_experiment
    run_name: Optional[str] = None,  # run_experiment
    device: str = "gpu",  # run_experiment
    fp16: bool = False,  # run_experiment
    text_input: Optional[str] = None,  # predict
    ids: Tuple[str, ...] = (),  # get_experiments, get_experiments_results
    as_df: bool = True,  # get_experiments, get_experiments_results
    results: bool = False,  # clear_data
):
    """
    :param flag:                [str], e.g. 'analyze_data', 'set_up_dataset', 'run_experiment', ..
    :param usage:               [str] 'cli' or 'api'
    :param dataset_name:        [str] e.g. 'swedish_ner_corpus'
    :param dataset_subset_name: [str] e.g. 'simple_cased'
    :param modify:              [bool] if True: modify tags as specified in method modify_ner_tag_mapping()
    :param val_fraction:        [float] e.g. 0.3
    :param verbose:             [bool]
    :param experiment_name:     [str], e.g. 'exp0'
    :param hparams:             [dict], e.g. {'multiple_runs': '2'} with hparams to use [HIERARCHY: I]
    :param from_preset:         [str], e.g. 'adaptive',
                                get experiment params & hparams from preset [HIERARCHY: II]
    :param from_config:         [bool] if True, get experiment params & hparams from config file [ALTERNATIVE]
    :param run_name:            [str or None], e.g. 'runA'
    :param device:              [str]
    :param fp16:                [bool]
    :param text_input:          [str], e.g. 'this is some text that needs to be annotated'
    :param ids:                 [tuple of str], experiment_ids to include
    :param as_df:               [bool] if True, return pandas DataFrame, else return dict
    :param results:             [bool] if True, clear not only checkpoints but also mlflow, tensorboard and logs
    """
    self._assert_flag(flag)

    os.environ["MLFLOW_TRACKING_URI"] = env_variable("DIR_MLFLOW")

    self.flag = flag
    self.usage = usage
    self.dataset_name = dataset_name  # analyze_data & set_up_dataset
    self.dataset_subset_name = dataset_subset_name  # set_up_dataset
    self.modify = modify  # set_up_dataset
    self.val_fraction = val_fraction  # set_up_dataset
    self.verbose = verbose
    self.experiment_name = experiment_name
    self.hparams: Optional[Dict[str, Union[str, int, bool]]] = self._process_hparams(
        hparams, from_preset
    )
    self.from_config: bool = from_config
    self.run_name = run_name  # run_experiment
    self.device = device  # run_experiment
    self.fp16 = fp16  # run_experiment
    self.text_input = text_input  # predict
    self.ids = ids  # get_experiments, get_experiments_results
    self.as_df = as_df  # get_experiments, get_experiments_results
    self.results = results  # clear_data

    if self.flag == "run_experiment":
        # hparams (with or without from_preset) and from_config are mutually exclusive
        assert (self.hparams is None and self.from_config is True) or (
            self.hparams is not None and self.from_config is False
        ), (
            f"ERROR! Need to specify "
            f"EITHER hparams (currently {self.hparams}) "
            f"with or without from_preset (currently {from_preset}) "
            f"OR from_config (currently {self.from_config})."
        )
        if self.from_config:
            path_experiment_config = join(
                env_variable("DIR_EXPERIMENT_CONFIGS"),
                f"{self.experiment_name}.ini",
            )
            if not isfile(path_experiment_config):
                self._exit_gracefully(
                    f"experiment_config = {path_experiment_config} does not exist."
                )
        else:
            assert (
                self.hparams is not None
            ), "ERROR! self.hparams is None but needs to be specified if dynamic arguments are used."
            for field in ["pretrained_model_name", "dataset_name"]:
                if field not in self.hparams.keys():
                    field_displayed = (
                        "model" if field == "pretrained_model_name" else "dataset"
                    )
                    self._exit_gracefully(
                        f"{field_displayed} is not specified but mandatory if dynamic arguments are used."
                    )

    data_dir = env_variable("DATA_DIR")
    if os.path.isdir(data_dir):
        self._set_client_and_get_experiments()
    else:  # will be set in init() method
        self.client = None
        self.experiment_id2name = None
        self.experiment_name2id = None
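# Usage sketch for the enclosing class (assuming it is NerBlackBoxMain, as in
# the nerblackbox repository; hparams values are illustrative):
#
#     main = NerBlackBoxMain(
#         flag="run_experiment",
#         experiment_name="exp0",
#         hparams={"pretrained_model_name": "bert-base-uncased",
#                  "dataset_name": "swedish_ner_corpus"},
#     )
#
# Passing neither hparams nor from_config=True (or both at once) fails the
# assertion at the top of the run_experiment branch.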