def get_next_parameter(socket):
    """
    Get the hyperparameters generated by the tuner.

    For a multiphase experiment, it returns a new group of hyperparameters at
    each call of get_next_parameter. For a non-multiphase experiment
    (multiPhase is not configured or is set to False), it returns
    hyperparameters only on the first call in each trial job and returns None
    from the second call onwards. For an experiment that is not specified as
    multiphase, this API should be called only once per trial job.

    Returns
    -------
    dict
        A dict containing the hyperparameters generated by the tuner; the keys
        of the dict are defined in the search space. Returns None if the tuner
        can generate no more hyperparameters.
    """
    global _params
    # _params = platform.get_next_parameter()  # v1.1
    father_id = -1
    start = time.time()
    _params = platform.get_next_parameter()
    end = time.time()
    if _params is None:
        return None
    socket.send_pyobj({"type": "get_next_parameter"})
    message = socket.recv_pyobj()
    tuner = message["tuner"]
    if tuner.history:
        p0 = multiprocessing.Process(target=tuner.generate_parameters,
                                     args=(int(get_sequence_id()),))
        p0.start()
        # Count SLURM compute nodes to derive the trial concurrency.
        trial_concurrency = os.popen('cat /etc/slurm-llnl/slurm.conf | grep NodeName | wc -l')
        trial_concurrency = int(trial_concurrency.read().strip())
        if get_sequence_id() < trial_concurrency:
            lock.acquire()
            with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/graph.txt", "a+") as f:
                json_and_id = 'json_out=' + str(_params['parameters']) + '+history' + "=False or True?"
                f.write(json_and_id + "\n")
            lock.release()
    else:
        socket.send_pyobj({"type": "generated_parameter"})
        message = socket.recv_pyobj()
        lock.acquire()
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/trials/" + str(nni.get_trial_id()) + "/output.log", "a+") as f:
            f.write(" generate=" + str(end - start) + "\n")
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/graph.txt", "a+") as f:
            json_and_id = 'json_out=' + str(_params['parameters']) + '+history' + "=False"
            f.write(json_and_id + "\n")
        lock.release()
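# For reference, the vanilla NNI trial-side flow that this wrapper extends is
# sketched below; `build_and_train` is a hypothetical training generator that
# yields one metric per epoch.
import nni

def run_trial():
    params = nni.get_next_parameter()  # dict keyed by the search-space fields
    final_metric = 0.0
    for metric in build_and_train(params):  # hypothetical trainer
        nni.report_intermediate_result(metric)
        final_metric = metric
    nni.report_final_result(final_metric)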
def output_experiment_detail(self, res_path):
    exp = nni.get_experiment_id()
    trial = nni.get_trial_id()
    trial_path = os.path.join(res_path, exp, trial)
    if not os.path.exists(os.path.join(res_path, exp)):
        os.mkdir(os.path.join(res_path, exp))
    if not os.path.exists(trial_path):
        os.mkdir(trial_path)
    # TODO whatever you want
    p_loss = self.plot_line(LOSS_PLOT, show_plot=False)
    p_acc = self.plot_line(ACCURACY_PLOT, show_plot=False)
    p_auc = self.plot_line(AUC_PLOT, show_plot=False)
    measures_table = [
        ["train_loss_vec"] + [str(x) for x in self.loss_train_vec],
        ["train_acc_vec"] + [str(x) for x in self.accuracy_train_vec],
        ["train_auc_vec"] + [str(x) for x in self.auc_train_vec],
        ["dev_loss_vec"] + [str(x) for x in self.loss_dev_vec],
        ["dev_acc_vec"] + [str(x) for x in self.accuracy_dev_vec],
        ["dev_auc_vec"] + [str(x) for x in self.auc_dev_vec],
        ["test_loss_vec"] + [str(x) for x in self.loss_test_vec],
        ["test_acc_vec"] + [str(x) for x in self.accuracy_test_vec],
        ["test_auc_vec"] + [str(x) for x in self.auc_test_vec]
    ]
    # The mode string must be passed to open(), not to os.path.join().
    with open(os.path.join(trial_path, "measures_by_epochs.csv"), "wt", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(measures_table)
def __init__(self, model_key, run_id, run_dir):
    self.model_key = model_key
    self.run_id = run_id
    self.run_dir = run_dir
    self.trial_id = nni.get_trial_id()
    self.exp_id = nni.get_experiment_id()
    self.scoring = accuracy_score
def get_nni_or_mlflow_experiment_and_trial() -> Tuple[Optional[str], Optional[str]]:
    """ Helper function returning the NNI experiment name and trial ID when NNI is not in standalone mode;
    otherwise, returns the MLflow experiment name and run ID if there is an active MLflow run.
    Returns (None, None) if NNI is in standalone mode and there is no active MLflow run. """
    if is_nni_run_standalone():
        exp, run = deepcv.utils.mlflow_get_experiment_run_info()
        return (None, None) if exp is None else (exp.name, str(run.run_id))
    return (nni.get_experiment_id(), nni.get_trial_id())
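# A minimal usage sketch: tag output artifacts with whichever tracking context
# is active (`artifact_tag` is illustrative, not part of the original code).
exp_name, run_id = get_nni_or_mlflow_experiment_and_trial()
if exp_name is None:
    artifact_tag = 'local-run'  # neither NNI nor MLflow is active
else:
    artifact_tag = f'{exp_name}-{run_id}'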
def setup_experiment(
        runtime_config: RuntimeConfig,
        enable_nni: bool = False,
        logger_blacklist: Optional[List[str]] = None) -> RuntimeConfig:
    if logger_blacklist is None:
        logger_blacklist = ['numba']

    setup_distributed_training()
    seed_everything(runtime_config.seed)

    if runtime_config.output_dir is None:
        if 'PT_OUTPUT_DIR' in os.environ:
            runtime_config.output_dir = Path(os.environ['PT_OUTPUT_DIR'])
        else:
            runtime_config.output_dir = Path('./outputs')
    if enable_nni:
        import nni
        if nni.get_experiment_id() != 'STANDALONE':
            runtime_config.output_dir = runtime_config.output_dir / nni.get_experiment_id() / str(nni.get_sequence_id())
    # parents=True so the nested <experiment_id>/<sequence_id> directories can be created in one call.
    runtime_config.output_dir.mkdir(parents=True, exist_ok=True)
    if runtime_config.checkpoint_dir is None:
        runtime_config.checkpoint_dir = runtime_config.output_dir / 'checkpoints'
    runtime_config.checkpoint_dir.mkdir(exist_ok=True)
    if runtime_config.tb_log_dir is None:
        runtime_config.tb_log_dir = runtime_config.output_dir / 'tb'
    runtime_config.tb_log_dir.mkdir(exist_ok=True)

    reset_logger()
    setup_logger('',
                 log_file=(runtime_config.output_dir / 'stdout.log').as_posix(),
                 log_level=logging.DEBUG if runtime_config.debug else logging.INFO)
    for logger in logger_blacklist:
        mute_logger(logger)

    global _runtime_config
    _runtime_config = runtime_config
    return runtime_config
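# A hypothetical call site, assuming RuntimeConfig exposes the fields
# referenced above (seed, output_dir, checkpoint_dir, tb_log_dir, debug).
config = RuntimeConfig(seed=42, output_dir=None, checkpoint_dir=None,
                       tb_log_dir=None, debug=False)
config = setup_experiment(config, enable_nni=True)
print(config.output_dir)  # e.g. outputs/<experiment_id>/<sequence_id> under NNI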
def prepare_hyper_search(cfg_kwargs: dict,
                         reporthook=None, final_reporthook=None,
                         primary_key=None, max_key=True, reporter_cls=None,
                         with_keys: (list, str, None) = None,
                         final_keys: (list, str, None) = None,
                         dump=False, disable=False):
    """
    Updated in v1.3.18

    Fetch hyperparameters from the nni package and update the configuration
    parameters with them. When nni is unavailable or the process is not running
    in nni search mode, the parameters are left unchanged.

    .. code-block :: python

        cfg_kwargs, reporthook, final_reporthook, tag = prepare_hyper_search(
            cfg_kwargs, reporthook, final_reporthook, primary_key="macro_avg:f1"
        )

        _cfg = Configuration(**cfg_kwargs)
        model = Model(_cfg)
        ...

        for epoch in range(_cfg.begin_epoch, _cfg.end_epoch):
            for batch_data in dataset:
                train_model(batch_data)
            data = evaluate_model()
            reporthook(data)
        final_reporthook()

    Parameters
    ----------
    cfg_kwargs: dict
        The parameters to be passed into cfg.
    reporthook
    final_reporthook
    primary_key:
        The primary key used to evaluate the model; it is reported as
        ``metric``'s ``default`` field in ``nni.report_intermediate_result``
        and ``nni.report_final_result``.
    max_key: bool
        Whether a larger primary key is better.
    reporter_cls
    with_keys: list or str
        Additional metrics to store; by default the final report uses their
        values at the point where primary_key is optimal.
    final_keys: list or str
        Keys in with_keys for which the last reported result is used instead of
        the value at the optimal primary_key.
    dump: bool
        When True, the ``workspace`` parameter in the configuration is changed
        to ``workspace/nni.get_experiment_id()/nni.get_trial_id()`` so that
        nni's intermediate results are persisted.
    disable

    Returns
    -------
    cfg_kwargs: dict
        The configuration parameters with the nni hyperparameters inserted.
    reporthook: function
        Callback invoked after each iteration to report intermediate results.
        Defaults to ``nni.report_intermediate_result``.
    final_reporthook:
        Callback invoked after all iterations to report the final result.
        Defaults to ``nni.report_final_result``.
    dump: bool
        Same as the input argument.

    Examples
    --------
    .. code-block :: python

        class CFG(Configuration):
            hyper_params = {"hidden_num": 100}
            learning_rate = 0.001
            workspace = ""

        cfg_kwargs, reporthook, final_reporthook, dump = prepare_hyper_search(
            {"learning_rate": 0.1}, primary_key="macro_avg:f1", with_keys="accuracy"
        )
        # cfg_kwargs: {'learning_rate': 0.1}

    when nni starts (e.g., using ``nni create --config _config.yml``), suppose in ``_config.yml``:

    .. code-block: yml

        searchSpacePath: _search_space.json

    and in ``_search_space.json``

    .. code-block :: json

        {
            "hidden_num": {"_type": "choice", "_value": [500, 600, 700, 835, 900]},
        }

    one of the returned cfg_kwargs could then be
    ``{'hyper_params': {'hidden_num': 500}, 'learning_rate': 0.1}``
    """
    if disable:
        return cfg_kwargs, None, None, None
    try:
        import nni
        from nni import get_next_parameter, report_intermediate_result, report_final_result

        assert primary_key is not None

        def _as_key_list(_keys: (list, str, None)):
            if isinstance(_keys, str):
                if ";" in _keys:
                    _keys = _keys.split(";")
                else:
                    _keys = [_keys]
            elif isinstance(_keys, list):
                pass
            elif _keys is None:
                _keys = []
            return _keys

        with_keys = _as_key_list(with_keys)
        final_keys = _as_key_list(final_keys)

        class Reporter(BaseReporter):
            def __init__(self):
                self.datas = []

            def intermediate(self, data):
                feed_dict = {
                    'default': float(get_by_key(data, key_parser(primary_key))),
                    primary_key: get_by_key(data, key_parser(primary_key))
                }
                for key in with_keys:
                    feed_dict[key] = get_by_key(data, key_parser(key))
                report_intermediate_result(feed_dict)
                self.datas.append(data)

            def final(self):
                best_fn = get_min if max_key is False else get_max
                _with_keys = (with_keys if with_keys else []) + [primary_key]
                _final_keys = set(final_keys if final_keys else [])
                final_result = best_fn(
                    self.datas, primary_key, with_keys=";".join(_with_keys), merge=False
                )
                feed_dict = {
                    'default': float(final_result[0][primary_key])
                }
                appendix_dict = dict(final_result[1][primary_key])
                for key in _with_keys:
                    if key in _final_keys:
                        feed_dict[key] = get_by_key(self.datas[-1], key_parser(key))
                    else:
                        feed_dict[key] = appendix_dict[key]
                report_final_result(feed_dict)

        # Instantiate the custom reporter class when one is supplied.
        rc = Reporter() if reporter_cls is None else reporter_cls()
        reporthook = reporthook if reporthook is not None else rc.intermediate
        final_reporthook = final_reporthook if final_reporthook is not None else rc.final
        cfg_cls_params = get_params(get_next_parameter())
        using_nni_tag = bool(cfg_cls_params)
        nested_update(cfg_kwargs, cfg_cls_params)
        if using_nni_tag is True and dump is True:  # pragma: no cover
            cfg_kwargs["workspace"] = cfg_kwargs.get("workspace", "") + path_append(
                nni.get_experiment_id(), nni.get_trial_id(), to_str=True
            )
        return cfg_kwargs, reporthook, final_reporthook, dump
    except ModuleNotFoundError:  # pragma: no cover
        warnings.warn("nni package not found, skip")
        return cfg_kwargs, reporthook, final_reporthook, dump
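# A small sketch of the standalone fallback described in the docstring: outside
# an NNI run the tuner supplies no parameters, so cfg_kwargs pass through.
cfg_kwargs, reporthook, final_reporthook, dump = prepare_hyper_search(
    {"learning_rate": 0.1}, primary_key="macro_avg:f1"
)
assert cfg_kwargs["learning_rate"] == 0.1  # unchanged without a tuner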
def train_eval(esargs, RCV_CONFIG, seqid):
    """ train and eval the model """
    global net
    global best_acc
    global bs_explore
    global gpus
    global hp_path

    best_acc = 0
    parse_rev_args(RCV_CONFIG, esargs)

    # train procedure
    trial_id = nni.get_trial_id()
    available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    gpus = len(available_devices.split(","))

    is_training = True
    filenames = ds.get_filenames(args.train_data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    ds_train = ds.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=bs_explore,
        shuffle_buffer=shuffle_buffer,
        parse_record_fn=ds.parse_record,
        num_epochs=args.epochs,
        npc=args.num_parallel_calls,
        num_gpus=gpus,
        examples_per_epoch=examples_per_epoch if is_training else None,
        dtype=tf.float32)

    is_training = False
    filenames = ds.get_filenames(args.val_data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    ds_val = ds.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=bs_explore,
        shuffle_buffer=shuffle_buffer,
        parse_record_fn=ds.parse_record,
        num_epochs=args.epochs,
        npc=args.num_parallel_calls,
        num_gpus=gpus,
        examples_per_epoch=None,
        dtype=tf.float32)

    # run epochs and patience
    loopnum = seqid // args.slave
    patience = min(int(6 + (2 * loopnum)), 20)
    if loopnum == 0:
        run_epochs = int(args.warmup_1)
    elif loopnum == 1:
        run_epochs = int(args.warmup_2)
    elif loopnum == 2:
        run_epochs = int(args.warmup_3)
    else:
        run_epochs = int(args.epochs)
    # if loopnum < 4:
    #     patience = int(8 + (2 * loopnum))
    #     run_epochs = int(10 + (20 * loopnum))
    # else:
    #     patience = 16
    #     run_epochs = args.epochs

    # lr strategy: linear decay from initial_lr towards 0 over args.epochs
    def scheduler2(epoch):
        lr_max = args.initial_lr
        total_epochs = args.epochs
        lr_each_epoch = lr_max - lr_max * epoch / total_epochs
        return lr_each_epoch

    callback = tf.keras.callbacks.LearningRateScheduler(scheduler2)

    # save weights
    checkpoint_dir = os.environ["HOME"] + "/nni/experiments/" + str(
        nni.get_experiment_id()) + "/checkpoint/" + str(nni.get_trial_id())
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_filepath = checkpoint_dir + "/weights." + "epoch." + str(run_epochs) + ".hdf5"
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        save_freq='epoch',
        save_weights_only=True,
    )

    history = net.fit(ds_train,
                      epochs=run_epochs,
                      steps_per_epoch=Ntrain // bs_explore // gpus,
                      validation_data=ds_val,
                      validation_steps=Nvalidation // bs_explore // gpus,
                      verbose=1,
                      shuffle=False,
                      callbacks=[
                          SendMetrics(hp_path), callback,
                          EarlyStopping(min_delta=0.001, patience=patience),
                          model_checkpoint_callback
                      ])

    # trial reports the best validation accuracy to the tuner
    acc = 0
    acc_list = history.history['val_accuracy']
    for acc_n in acc_list:
        if float(acc_n) > acc:
            acc = float(acc_n)
    try:
        # predict acc
        if run_epochs >= 10 and run_epochs < 80:
            epoch_x = range(1, len(acc_list) + 1)
            pacc = utils.predict_acc(trial_id, epoch_x, acc_list, 90, True)
            best_acc = float(pacc)
    except Exception:
        print("Predict failed.")
    if acc > best_acc:
        best_acc = acc
    logger.debug("Final result is: %.3f", acc)
    return best_acc, history.epoch[-1]
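# Quick standalone check of the linear decay used by scheduler2 above
# (a sketch with an assumed initial_lr of 0.1 over 100 epochs):
# lr(epoch) = lr_max * (1 - epoch / total_epochs)
lr_max, total_epochs = 0.1, 100  # assumed values for illustration
for epoch in (0, 50, 99):
    print(epoch, lr_max - lr_max * epoch / total_epochs)
# -> approximately 0.1, 0.05 and 0.001; the rate reaches 0 at the final epoch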
if __name__ == "__main__":
    example_start_time = time.time()
    net = None
    args = get_args()
    try:
        experiment_path = os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id())
        lock = multiprocessing.Lock()
        context = zmq.Context()
        socket = context.socket(zmq.REQ)
        tmpstr = 'tcp://' + args.ip + ':800081'
        socket.connect(tmpstr)
        os.makedirs(experiment_path + "/trials/" + str(nni.get_trial_id()))
        get_next_parameter_start = time.time()
        nni.get_next_parameter(socket)
        get_next_parameter_end = time.time()
        while True:
            lock.acquire()
            with open(experiment_path + "/graph.txt", "a+") as f:
                f.seek(0)
def test_get_experiment_id(self):
    self.assertEqual(nni.get_experiment_id(), 'fakeidex')
class ClassifyParam:
    local_model_path = os.path.join(
        'data', 'cache',
        'classify_{}_{}.model'.format(nni.get_experiment_id(), nni.get_trial_id()))
    top_n_list = list(range(1, 11)) + [15, 20]
def _start_mlflow_run(self, run_params: Dict[str, Any], pipeline: Pipeline):
    """ Log basic information to MLflow about the pipeline if it is tagged with 'train'
    (creates a new MLflow experiment and/or run named after the training pipeline if it doesn't exist yet).
    NOTE: If NNI is in dry-run mode (the mode used to generate the NNI Classic NAS search-space JSON file from a
    model which contains NNI NAS mutables `LayerChoice` and/or `InputChoice`), we avoid creating any new MLflow
    experiment/run and avoid logging anything else to MLflow during this dry run.
    """
    node_tags = functools.reduce(set.union, [n.tags for n in pipeline.nodes])
    if not deepcv.meta.nni_tools.is_nni_gen_search_space_mode() and ('train' in run_params['tags'] or 'train' in node_tags):
        if mlflow.active_run() is None:
            # Create an MLflow run in an experiment named after the pipeline involved in training and log various
            # pipeline/dataset information to MLflow. If we are running an NNI hp/nas search, the MLflow experiment
            # and run will be named after the NNI experiment and trial IDs for better consistency.
            # TODO: find another way to name the experiment, as the pipeline name is only available when running
            # `kedro run --pipeline=<pipeline_name>` (e.g. a special tag on the node after which the experiment is named)
            if not deepcv.meta.nni_tools.is_nni_run_standalone():  # 'STANDALONE' is NNI's default experiment ID when the Python process wasn't started by NNI
                nni_experiment = nni.get_experiment_id()
                mlflow.set_experiment(nni_experiment)
                mlflow.start_run(run_name=nni.get_trial_id())
                # Flags indicating whether we are using the NNI HP or Classic NAS API (hyperparameter and/or classic neural architecture search using NNI)
                mlflow.set_tag('nni_standalone_mode', False)
                mlflow.set_tag('nni_experiment_id', nni_experiment)
                mlflow.set_tag('nni_trial_id', nni.get_trial_id())
                mlflow.set_tag('nni_sequence_id', nni.get_sequence_id())
            else:
                pipeline_name = run_params['pipeline_name'].lower() if run_params['pipeline_name'] else 'default'
                mlflow.set_experiment(f'{self.project_ctx.project_name.lower()}_{pipeline_name}')
                mlflow.start_run(run_name=f'{pipeline_name.lower()}_run_{run_params["run_id"]}')
                mlflow.set_tag('nni_standalone_mode', True)

        # Log basic information about the Kedro training pipeline to MLflow
        mlflow.set_tags({f'kedro_node_tag_{i}': tag for i, tag in enumerate(node_tags)})
        mlflow.log_params({n: v for n, v in run_params.items() if v})
        mlflow.log_param('pipeline.json', pipeline.to_json())
        mlflow.log_param('pipeline.describe', pipeline.describe())
        mlflow.log_param('pipeline.pipeline_datasets', pipeline.data_sets())

        """ The following code creates special MLflow tags with information about the current repository, which
        MLflow does not do when a run is started from code instead of from the `mlflow run` command.
        Code inspired by [`mlflow.projects._create_run`](https://www.mlflow.org/docs/latest/_modules/mlflow/projects.html)
        which doesn't seem to be called by `mlflow.start_run` """
        tags = {mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME: self.project_ctx.package_name,
                mlflow.utils.mlflow_tags.MLFLOW_SOURCE_TYPE: mlflow.entities.SourceType.to_string(mlflow.entities.SourceType.PROJECT),
                mlflow.utils.mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT: inspect.getsourcefile(type(self.project_ctx))}
        try:
            repo = git.Repo(self.project_ctx.project_path, search_parent_directories=True)
            git_repo_url = repo.remote().url if 'origin' in repo.remotes else (repo.remotes[0].url if len(repo.remotes) > 0 else '')
            # Convert SSH git URL to an HTTP URL and strip any trailing '.git' suffix
            # (str.rstrip('.git') would strip those *characters*, not the suffix)
            git_repo_url = re.sub(r'git@([.\w]+):', r'https://\1/', git_repo_url)
            if git_repo_url.endswith('.git'):
                git_repo_url = git_repo_url[:-len('.git')]
            mlflow.log_param('commit_url', git_repo_url + f'/commit/{repo.head.commit.hexsha}/')
            # We also set MLFLOW_SOURCE_NAME to the repo URL so that the MLflow web UI can parse it and render commit
            # and source hyperlinks (MLflow only supports GitHub URLs for now)
            tags.update({mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME: git_repo_url if git_repo_url else self.project_ctx.project_name,
                         mlflow.utils.mlflow_tags.MLFLOW_GIT_BRANCH: repo.active_branch.name,
                         mlflow.utils.mlflow_tags.MLFLOW_GIT_REPO_URL: git_repo_url,
                         mlflow.utils.mlflow_tags.MLFLOW_GIT_COMMIT: repo.head.commit.hexsha})

            # Change the MLflow user to the git repository user instead of the system user (if any git user is specified)
            git_config_reader = repo.config_reader()
            git_config_reader.read()
            user = git_config_reader.get_value('user', 'name', default=None)
            email = git_config_reader.get_value('user', 'email', default=None)
            if user or email:
                tags[mlflow.utils.mlflow_tags.MLFLOW_USER] = (str(user) + (f' <{email}>' if email else '')) if user else str(email)
        except (ImportError, OSError, ValueError, IOError, KeyError, git.GitError, configparser.Error) as e:
            logging.warning(f'Failed to import Git or to get repository information. Error: {e}')
        mlflow.set_tags(tags)
def train_search(config, params=None, warm_start_NN=None, restore_old_checkpoint=False, workers=1, verbosity=0):
    """
    train_search is practically the same as the train function from training_torch, just made for NNI experiments
    :param config:
    :param params:
    :param warm_start_NN:
    :param restore_old_checkpoint:
    :param workers:
    :param verbosity:
    :return:
    """
    if verbosity == 0:
        logger.setLevel(logging.INFO)
    if verbosity >= 1:
        logger.setLevel(logging.DEBUG)

    start = time.time()
    logger.info('Preparing Datasets')
    train_dataset, validation_dataset = prepare_dataset_torch(config)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    test_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=params['batch_size'], shuffle=True)

    logger.info('Initializing Torch Network')
    net = map_model(config, params)

    logger.info('Optimizer Initialize')
    optimizer = map_optimizer(params['optimizer'], net.parameters(), params['learning_rate'])
    loss_func = map_loss_func(params['loss'])
    criterion = torch.nn.MSELoss()
    if config['scheduler']:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=config['scheduler_milestones'], gamma=0.1)
    else:
        scheduler = None
    epochs = config['epochs']

    # Track the losses to determine early stopping
    avg_train_loss = []
    avg_valid_loss = []

    # initialize the early_stopping object
    early_stopping = EarlyStopping(verbose=True, trace_func=logger.info)

    logger.info('Start Training!')
    for epoch in range(epochs):
        train_loss, validation_loss, RMSE = train_epoch(net, optimizer, loss_func,
                                                        train_loader=train_loader, test_loader=test_loader,
                                                        scheduler=scheduler, criterion=criterion)
        nni.report_intermediate_result(-math.log10(RMSE))
        if early_stopping is not None:
            early_stopping(validation_loss, net, RMSE)
            RMSE = early_stopping.RMSE
        avg_train_loss.append(train_loss)
        avg_valid_loss.append(validation_loss)
        logger.info('Epoch {}; Train Loss: {:.5}; Valid Loss: {:.5}; Best Validation RMSE: {:.5}'
                    .format(epoch, train_loss, validation_loss, RMSE))
        print('Epoch {}; Train Loss: {:.5}; Valid Loss: {:.5}; Validation RMSE: {:.5}'
              .format(epoch, train_loss, validation_loss, RMSE))
        if early_stopping.early_stop:
            logger.info('Early Stopping')
            RMSE = early_stopping.RMSE
            break
    nni.report_final_result(-math.log10(RMSE))
    end = time.time()
    logger.info('Training Completed: Time elapsed: {:.2} Seconds'.format(end - start))
    plot_against_scaling(net, validation_dataset, criterion,
                         trial_id=str(nni.get_trial_id()), exp_id=str(nni.get_experiment_id()))
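# Reporting -math.log10(RMSE) turns a minimize-the-error objective into a
# maximize metric for the tuner, gaining one unit per order of magnitude:
import math
for rmse in (0.1, 0.01, 0.001):
    print(rmse, -math.log10(rmse))  # approximately 1.0, 2.0, 3.0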
def is_nni_run_standalone() -> bool:
    """ Simple helper function which returns whether NNI is in standalone trial run mode """
    return nni.get_experiment_id() == r'STANDALONE' and nni.get_trial_id() == r'STANDALONE' and nni.get_sequence_id() == 0
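# A typical guard built on this helper (a sketch; the metric is a placeholder):
# skip NNI reporting when a trial script is run directly for debugging.
import nni
final_score = 0.93  # placeholder value for illustration
if not is_nni_run_standalone():
    nni.report_final_result(final_score)
else:
    print('standalone debug run, final score:', final_score)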
def generate_parameters(self, parameter_id, **kwargs):
    """
    Returns a set of trial neural architecture, as a serializable object.

    Parameters
    ----------
    parameter_id : int
    """
    # If there is no history, the slave node will use the fake model.
    if not self.history:
        print("If there is no history, generate_parameters should not be called!")
        exit(1)
    total_start = time.time()

    # Derive how many architectures to generate per call from the measured
    # train-time / generate-time ratio (ratios of 2-4 are used; otherwise 1).
    rate = 1
    if (os.path.exists(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/generate_time")
            and os.path.exists(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/train_time")):
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/generate_time", "r") as f:
            generate_time = float(f.read())
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/train_time", "r") as f:
            train_time = float(f.read())
        if (generate_time != 0) and (train_time != 0):
            realrate = int(train_time / generate_time)
            if (realrate < 5) and (realrate > 1):
                rate = int(realrate)
            if realrate <= 1:
                rate = 1

    for i in range(rate):
        start = time.time()
        new_father_id = None
        generated_graph = None
        if not self.training_queue:
            new_father_id, generated_graph = self.generate()
            father_id, json_out, new_model_id = self.total_data[parameter_id]
            self.training_queue.append((generated_graph, new_father_id, new_model_id))
            # self.descriptors.append(generated_graph.extract_descriptor())
        else:
            print("training_queue should be an empty list.")
            exit(1)
        graph, father_id, model_id = self.training_queue.pop(0)

        # from graph to json
        json_model_path = os.path.join(self.path, str(model_id) + ".json")
        json_out = graph_to_json(graph, json_model_path)
        end = time.time()
        # self.total_data[parameter_id] = (json_out, father_id, model_id)
        json_and_id = "json_out=" + str(json_out) + "+father_id=" + str(father_id) + "+parameter_id=" + str(parameter_id) + "+history=" + "True"
        lock.acquire()
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/trials/" + str(nni.get_trial_id()) + "/output.log", "a+") as f:
            f.write("single_generate=" + str(end - start) + "\n")
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/graph.txt", "a+") as f:
            f.write(json_and_id + "\n")
        lock.release()

    total_end = time.time()
    lock.acquire()
    with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/trials/" + str(nni.get_trial_id()) + "/output.log", "a+") as f:
        f.write("total_generate=" + str(total_end - total_start) + "\n")
    lock.release()
    # Persist the average per-architecture generation time for the next rate estimate.
    totime = abs(total_end - total_start)
    with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/generate_time", "w+") as f:
        gt = totime / rate
        f.write(str(gt))
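# A quick illustration of the rate heuristic above (a sketch with assumed
# timings): only train/generate ratios strictly between 1 and 5 raise the rate.
def generation_rate(train_time: float, generate_time: float) -> int:
    if generate_time <= 0 or train_time <= 0:
        return 1
    realrate = int(train_time / generate_time)
    return realrate if 1 < realrate < 5 else 1

print(generation_rate(90.0, 30.0))   # 3: training dominates, batch 3 graphs
print(generation_rate(30.0, 90.0))   # 1: generation dominates
print(generation_rate(600.0, 30.0))  # 1: ratio 20 falls outside the 2-4 window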
def main(self, hp_params):
    model_args = self.model_args
    data_args = self.data_args
    training_args = self.training_args

    # arguments manipulation
    if nni.get_experiment_id() != 'STANDALONE':
        training_args.output_dir = f"{training_args.output_dir}/{nni.get_experiment_id()}-{nni.get_trial_id()}"
    model_args.model_name_or_path = hp_params['backbone']
    training_args.learning_rate = hp_params['learning_rate']
    training_args.seed = hp_params['seed']
    if hp_params["max_seq_length"] > 384:
        # Long sequences: shrink the per-device batch and compensate with gradient accumulation.
        training_args.per_device_train_batch_size = 2
        training_args.per_device_eval_batch_size = 2
        training_args.gradient_accumulation_steps = 16
        data_args.max_seq_length = hp_params["max_seq_length"]
    else:
        data_args.max_seq_length = hp_params["max_seq_length"]

    # get token classification task instance
    module = import_module("tasks")
    try:
        token_classification_task_clazz = getattr(module, model_args.task_type)
        token_classification_task: TokenClassificationTask = token_classification_task_clazz()
    except AttributeError:
        raise ValueError(
            f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
            f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}")

    # label
    labels = token_classification_task.get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)
    self.label_map = label_map

    # load pretrained model and tokenizer
    config = self._custom_config(model_args=model_args,
                                 num_labels=num_labels,
                                 id2label=label_map,
                                 label2id={label: i for i, label in enumerate(labels)})
    tokenizer = self._custom_tokenizer(model_args=model_args)
    model = self._custom_model(model_args=model_args, config=config)
    # config = AutoConfig.from_pretrained(
    #     model_args.config_name if model_args.config_name else model_args.model_name_or_path,
    #     num_labels=num_labels,
    #     id2label=label_map,
    #     label2id={label: i for i, label in enumerate(labels)},
    #     cache_dir=model_args.cache_dir,
    # )
    # tokenizer = AutoTokenizer.from_pretrained(
    #     model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    #     cache_dir=model_args.cache_dir,
    #     use_fast=model_args.use_fast,
    # )
    # model = AutoModelForTokenClassification.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=bool(".ckpt" in model_args.model_name_or_path),
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # )

    # get dataset and data_collator
    train_dataset = (TokenClassificationDataset(
        token_classification_task=token_classification_task,
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    ) if training_args.do_train else None)
    eval_dataset = (TokenClassificationDataset(
        token_classification_task=token_classification_task,
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_eval else None)
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # callbacks
    callbacks = [
        NNiCallback(hp_metric=training_args.metric_for_best_model,
                    greater_is_better=training_args.greater_is_better)
    ]

    # reset logging, eval and save steps so that each happens exactly once per epoch
    steps_per_epoch = int(np.ceil(len(train_dataset) / (training_args.train_batch_size * training_args.gradient_accumulation_steps)))
    training_args.logging_steps = steps_per_epoch
    training_args.save_steps = steps_per_epoch
    training_args.eval_steps = steps_per_epoch

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=callbacks,
        compute_metrics=self.compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )
        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = self.align_predictions(predictions, label_ids)
        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "test.json"), "r") as f:
                    docs = json.load(f)
                for doc, preds in zip(docs, preds_list):
                    text = doc['_source']['text']
                    labels = doc['_source']['label_list']
                    preds = ' '.join(preds)
                    print(f"{text}\t{labels}\t{preds}", file=writer)

    # nni final result
    if training_args.greater_is_better:
        nni.report_final_result(max(METRICS))
    else:
        nni.report_final_result(min(METRICS))
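# NNiCallback bridges the transformers Trainer and NNI's reporting API; its
# implementation isn't shown here, but a minimal sketch under assumed behavior
# (reporting the monitored eval metric as an intermediate result) could be:
from transformers import TrainerCallback
import nni

class NNIReportingCallback(TrainerCallback):  # hypothetical stand-in for NNiCallback
    def __init__(self, hp_metric: str, greater_is_better: bool):
        self.hp_metric = hp_metric
        self.greater_is_better = greater_is_better

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        value = (metrics or {}).get(f"eval_{self.hp_metric}")  # Trainer prefixes eval metrics
        if value is not None:
            nni.report_intermediate_result(value)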
def get_experiment_id() -> str:
    return nni.get_experiment_id()
# params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > epochs "] = 1 # params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > steps_per_epoch "] = 1 # params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > validation_steps "] = 1 # params["gs_research_workflow.time_series.models.inception_time:InceptionTime.HP > depth"] = 5 # params["gs_research_workflow.time_series.models.inception_time:InceptionTime.HP > use_residual"] = True if cfg_alias_cls: params = { cfg_alias_cls.get_cfg_loc(k): v for k, v in params.items() } trial_uuid = generate_uuid() experiment_id = generate_uuid() else: os.environ[ENV_KEY_TRIAL_IN_NNI] = "1" params = nni.get_next_parameter() experiment_id = nni.get_experiment_id() trial_uuid = nni.get_trial_id() if cfg_alias_cls: params = { cfg_alias_cls.get_cfg_loc(k): v for k, v in params.items() } # 对 item 进行 unescape params = {k: unescape_nni_choice_item(v) for k, v in params.items()} yml_path = os.path.join(os.path.dirname(__file__), "../../..", args.cfg) if not os.path.isfile(yml_path): logger.error(f"Default cfg file {yml_path} is not existed!") sys.exit(0) trial_task = HPOTrialPodSideEnv(args.name, yml_path, params, trial_uuid,
class NextLocParam:
    local_model_path = os.path.join(
        'data', 'cache',
        'next_loc_{}_{}.model'.format(nni.get_experiment_id(), nni.get_trial_id()))
    local_result_path = os.path.join('data', 'cache', 'next_loc_result.h5')
    top_n_list = [1, 2, 3, 4, 5, 10, 20]