def _finalize_experiment(
    experiment_json,
    metric,
    app_id,
    run_id,
    state,
    duration,
    logdir,
    best_logdir,
    optimization_key,
):
    """Attaches the experiment outcome as xattr metadata to the app directory."""
    outputs = _build_summary_json(logdir)

    if outputs:
        hopshdfs.dump(outputs, logdir + "/.summary.json")

    if best_logdir:
        experiment_json["bestDir"] = best_logdir[len(hopshdfs.project_path()):]
    experiment_json["optimizationKey"] = optimization_key
    experiment_json["metric"] = metric
    experiment_json["state"] = state
    experiment_json["duration"] = duration

    experiment_utils._attach_experiment_xattr(app_id, run_id, experiment_json, "REPLACE")
def _handle_return_simple(retval, hdfs_exec_logdir, logfile):
    """Persists the return value of a simple (non-optimization) experiment to HDFS.

    Args:
        :retval: value returned by the user-supplied training function
        :hdfs_exec_logdir: HDFS directory of the execution
        :logfile: path of the execution log file

    Returns:
        None
    """
    return_file = hdfs_exec_logdir + '/.outputs.json'

    # Nothing was returned: persist only the log path, if any, and exit early
    if not retval:
        if logfile is not None:
            retval = {'log': logfile}
        hdfs.dump(dumps(retval), return_file)
        return

    _upload_file_output(retval, hdfs_exec_logdir)

    # Validation
    if type(retval) is not dict:
        try:
            retval = {'metric': retval}
        except:
            pass

    retval['log'] = hdfs_exec_logdir.replace(hdfs.project_path(), '') + '/output.log'

    hdfs.dump(dumps(retval), return_file)
def write_featureframe(self):
    """
    Writes a dataframe of data as a training dataset on HDFS in the hdf5 format

    Returns:
        None

    Raises:
        :ValueError: if the user supplied a write mode that is not supported
        :HDF5DatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to write an external
                                                                   training dataset in the .hdf5 format.
    """
    if self.training_dataset.training_dataset_type == \
            constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
        raise HDF5DatasetFormatNotSupportedForExternalTrainingDatasets(
            "The .hdf5 dataset format is not supported for external training datasets.")
    if self.write_mode == constants.SPARK_CONFIG.SPARK_APPEND_MODE:
        raise ValueError(
            "Append is not supported for training datasets stored in .hdf5 format, only overwrite, "
            "set the optional argument write_mode='overwrite'")

    # convert the input dataframe to a numpy array before writing it as hdf5
    if not isinstance(self.df, np.ndarray):
        if isinstance(self.df, DataFrame) or isinstance(self.df, RDD):
            df = np.array(self.df.collect())
        if isinstance(self.df, pd.DataFrame):
            df = self.df.values
        if isinstance(self.df, list):
            df = np.array(self.df)
    else:
        df = self.df

    tf = TemporaryFile()
    tf.seek(0)
    hdf5_file = h5py.File(tf)
    tf.seek(0)
    hdf5_file.create_dataset(self.training_dataset.name, data=df)
    tf.seek(0)
    hdf5_file.close()
    tf.seek(0)
    hdfs.dump(tf.read(),
              self.path + constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX)
def init_logger(self, trial_log_file):
    """Initializes the trial log file"""
    self.trial_log_file = trial_log_file
    # Open trial log file descriptor
    if not hopshdfs.exists(self.trial_log_file):
        hopshdfs.dump("", self.trial_log_file)
    self.trial_fd = hopshdfs.open_file(self.trial_log_file, flags="w")
def _create_experiment_subdirectories(app_id, run_id, param_string, type, sub_type=None, params=None):
    """
    Creates directories for an experiment, if Experiments folder exists it will create directories
    below it, otherwise it will create them in the Logs directory.

    Args:
        :app_id: YARN application ID of the experiment
        :run_id: Experiment ID
        :param_string: name of the new directory created under parent directories
        :type: type of the new directory parent, e.g. differential_evolution
        :sub_type: type of sub directory to parent, e.g. generation
        :params: dict of hyperparameters

    Returns:
        The new directories for the yarn-application and for the execution (hdfs_exec_logdir, hdfs_appid_logdir)
    """
    pyhdfs_handle = hdfs.get()

    hdfs_events_parent_dir = hdfs.project_path() + "Experiments"

    hdfs_experiment_dir = hdfs_events_parent_dir + "/" + app_id + "_" + str(run_id)

    # determine directory structure based on arguments
    if sub_type:
        hdfs_exec_logdir = hdfs_experiment_dir + "/" + str(sub_type) + '/' + str(param_string)
        if pyhdfs_handle.exists(hdfs_exec_logdir):
            hdfs.delete(hdfs_exec_logdir, recursive=True)
    elif not param_string and not sub_type:
        if pyhdfs_handle.exists(hdfs_experiment_dir):
            hdfs.delete(hdfs_experiment_dir, recursive=True)
        hdfs_exec_logdir = hdfs_experiment_dir + '/'
    else:
        hdfs_exec_logdir = hdfs_experiment_dir + '/' + str(param_string)
        # Need to remove directory if it exists (might be a task retry)
        if pyhdfs_handle.exists(hdfs_exec_logdir):
            hdfs.delete(hdfs_exec_logdir, recursive=True)

    # create the new directory
    pyhdfs_handle.create_directory(hdfs_exec_logdir)

    return_file = hdfs_exec_logdir + '/.hparams.json'
    hdfs.dump(dumps(params), return_file)

    return hdfs_exec_logdir, hdfs_experiment_dir
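# Illustrative sketch (assumptions, not part of the module): given a project path of
# "hdfs:///Projects/demo/", a call such as
#
#     exec_dir, exp_dir = _create_experiment_subdirectories(
#         "application_1600000000000_0001", 1, "lr=0.01", "grid_search",
#         params={"lr": 0.01})
#
# would create and return roughly
#     exp_dir  -> .../Experiments/application_1600000000000_0001_1
#     exec_dir -> .../Experiments/application_1600000000000_0001_1/lr=0.01
# with the hyperparameters dumped to exec_dir + "/.hparams.json". The application id
# and parameter values above are placeholders.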
def initialize_logger(self, exp_dir):
    """Initialize logger of the pruner

    :param exp_dir: path of experiment directory
    :type exp_dir: str
    """
    # configure logger
    self.log_file = exp_dir + "/pruner.log"
    if not hdfs.exists(self.log_file):
        hdfs.dump("", self.log_file)
    self.fd = hdfs.open_file(self.log_file, flags="w")
    self._log("Initialized Pruner Logger")
def _handle_return(retval, hdfs_exec_logdir, optimization_key, logfile):
    """Validates and persists the return value of an optimization experiment.

    Args:
        :retval: value returned by the user-supplied training function
        :hdfs_exec_logdir: HDFS directory of the execution
        :optimization_key: key in the returned dict that the optimizer should use
        :logfile: path of the execution log file

    Returns:
        None
    """
    _upload_file_output(retval, hdfs_exec_logdir)

    # Validation
    if not optimization_key and type(retval) is dict and len(retval.keys()) > 1:
        raise Exception(
            'Missing optimization_key argument, when returning multiple values in a dict '
            'the optimization_key argument must be set.')
    elif type(retval) is dict and optimization_key not in retval and len(retval.keys()) >= 1:
        raise Exception(
            'optimization_key not in returned dict, when returning multiple values in a dict the '
            'optimization_key argument must be set to indicate which key the optimization algorithm '
            'should maximize or minimize on.')
    elif type(retval) is dict and len(retval.keys()) == 0:
        raise Exception(
            'Returned dict is empty, must contain at least 1 metric to maximize or minimize.')

    # Validate that the value of the optimization_key is a number
    if type(retval) is dict and len(retval.keys()) > 1:
        opt_val = retval[optimization_key]
        opt_val = _validate_optimization_value(opt_val)
        retval[optimization_key] = opt_val
    elif type(retval) is dict and len(retval.keys()) == 1:
        opt_val = retval[list(retval.keys())[0]]
        opt_val = _validate_optimization_value(opt_val)
        retval[list(retval.keys())[0]] = opt_val
    else:
        opt_val = _validate_optimization_value(retval)
        retval = {'metric': opt_val}

    retval['log'] = logfile

    return_file = hdfs_exec_logdir + '/.outputs.json'
    hdfs.dump(dumps(retval), return_file)

    metric_file = hdfs_exec_logdir + '/.metric'
    hdfs.dump(str(opt_val), metric_file)
def _store_tf_record_schema_hdfs(tfrecord_schema, hdfs_path):
    """
    Stores a tfrecord json schema to HDFS

    Args:
        :tfrecord_schema: the tfrecord schema to store
        :hdfs_path: the hdfs path to store it

    Returns:
        None
    """
    json_str = json.dumps(tfrecord_schema)
    hdfs.dump(json_str,
              hdfs_path + constants.DELIMITERS.SLASH_DELIMITER +
              constants.FEATURE_STORE.TRAINING_DATASET_TF_RECORD_SCHEMA_FILE_NAME)
def _finalize_experiment(experiment_json, metric, app_id, run_id, state,
                         duration, logdir, bestLogdir, optimization_key):
    """Attaches the experiment outcome as xattr metadata to the app directory."""
    summary_file = _build_summary_json(logdir)

    if summary_file:
        hdfs.dump(summary_file, logdir + '/.summary.json')

    if bestLogdir:
        experiment_json['bestDir'] = bestLogdir[len(hdfs.project_path()):]
    experiment_json['optimizationKey'] = optimization_key
    experiment_json['metric'] = metric
    experiment_json['state'] = state
    experiment_json['duration'] = duration

    _attach_experiment_xattr(app_id, run_id, experiment_json, 'REPLACE')
def __init__(self, log_file, partition_id, task_attempt, print_executor):
    self.metric = None
    self.lock = threading.RLock()
    self.stop = False
    self.trial_id = None
    self.trial_log_file = None
    self.logs = ""
    self.log_file = log_file
    self.partition_id = partition_id
    self.task_attempt = task_attempt
    self.print_executor = print_executor

    # Open executor log file descriptor
    # This log is for all maggy system related log messages
    if not hopshdfs.exists(log_file):
        hopshdfs.dump("", log_file)
    self.fd = hopshdfs.open_file(log_file, flags="w")
    self.trial_fd = None
def _handle_return_val(return_val, log_dir, optimization_key, log_file):
    """Handles the return value of the user defined training function."""
    experiment_utils._upload_file_output(return_val, log_dir)

    # Return type validation
    if not optimization_key:
        raise ValueError("Optimization key cannot be None.")
    if not return_val:
        raise exceptions.ReturnTypeError(optimization_key, return_val)
    if not isinstance(return_val, constants.USER_FCT.RETURN_TYPES):
        raise exceptions.ReturnTypeError(optimization_key, return_val)
    if isinstance(return_val, dict) and optimization_key not in return_val:
        raise KeyError(
            "Returned dictionary does not contain optimization key with the "
            "provided name: {}".format(optimization_key))

    # validate that optimization metric is numeric
    if isinstance(return_val, dict):
        opt_val = return_val[optimization_key]
    else:
        opt_val = return_val
        return_val = {optimization_key: opt_val}

    if not isinstance(opt_val, constants.USER_FCT.NUMERIC_TYPES):
        raise exceptions.MetricTypeError(optimization_key, opt_val)

    # for key, value in return_val.items():
    #     return_val[key] = value if isinstance(value, str) else str(value)

    return_val["log"] = log_file.replace(hopshdfs.project_path(), "")

    return_file = log_dir + "/.outputs.json"
    hopshdfs.dump(json.dumps(return_val, default=json_default_numpy), return_file)

    metric_file = log_dir + "/.metric"
    hopshdfs.dump(json.dumps(opt_val, default=json_default_numpy), metric_file)

    return opt_val
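# Illustrative sketch (assumptions, not part of the module): the user training function
# may return either a bare number or a dict containing the optimization key; both shapes
# pass the validation above and are persisted as .outputs.json / .metric. The function
# names and values below are placeholders.
#
#     def train(lr, reporter):
#         ...
#         return {"accuracy": 0.91, "loss": 0.23}   # requires optimization_key="accuracy"
#
#     def train_simple(lr):
#         ...
#         return 0.91                               # wrapped as {optimization_key: 0.91}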
def _handle_return_simple(retval, hdfs_exec_logdir, logfile):
    """Persists the return value of the user defined training function to HDFS.

    Args:
        :retval: value returned by the user-supplied training function
        :hdfs_exec_logdir: HDFS directory of the execution
        :logfile: path of the execution log file

    Returns:
        None
    """
    return_file = hdfs_exec_logdir + '/.outputs.json'

    _upload_file_output(retval, hdfs_exec_logdir)

    # Validation
    if type(retval) is not dict:
        try:
            retval = {'metric': retval}
        except:
            pass

    retval['log'] = logfile

    hdfs.dump(dumps(retval), return_file)
def dump(self, data, hdfs_path):
    """Dumps data to a file at the given HDFS path."""
    return hopshdfs.dump(data, hdfs_path)
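# Illustrative sketch (assumption, not part of the module): since the method simply
# delegates to hops.hdfs.dump, any small string payload can be written directly; the
# instance name below is a placeholder.
#
#     store.dump('{"metric": 0.93}', "Resources/outputs.json")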
def _register(hdfs_exec_dir, endpoint_dir, exec_num, local_logdir=False):
    """Starts a TensorBoard server on the executor and registers its endpoint in HDFS.

    Args:
        :hdfs_exec_dir: HDFS directory containing the TensorBoard event files
        :endpoint_dir: HDFS directory where the TensorBoard endpoint file is written
        :exec_num: executor number, used to name the endpoint file
        :local_logdir: whether TensorBoard should read events from a local directory

    Returns:
        The endpoint path and the PID of the TensorBoard process
    """
    global tb_pid

    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])

    _reset_global()

    global events_logdir
    events_logdir = hdfs_exec_dir

    global local_logdir_bool
    local_logdir_bool = local_logdir

    if tb_pid == 0:
        global pypath
        pypath = os.getenv("PYSPARK_PYTHON")

        # find free port
        tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tb_socket.bind(('', 0))
        global tb_port
        tb_addr, tb_port = tb_socket.getsockname()

        global tb_path
        tb_path = experiment_utils._find_tensorboard()
        tb_socket.close()

        tb_env = _init_tb_env()

        global local_logdir_path
        if local_logdir:
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
                os.makedirs(local_logdir_path)
            else:
                os.makedirs(local_logdir_path)
            local_logdir_path = local_logdir_path + '/'
            tb_proc = subprocess.Popen(
                [pypath, tb_path,
                 "--logdir=%s" % local_logdir_path,
                 "--port=%d" % tb_port,
                 "--host=%s" % "0.0.0.0"],
                env=tb_env,
                preexec_fn=util._on_executor_exit('SIGTERM'))
        else:
            tb_proc = subprocess.Popen(
                [pypath, tb_path,
                 "--logdir=%s" % events_logdir,
                 "--port=%d" % tb_port,
                 "--host=%s" % "0.0.0.0"],
                env=tb_env,
                preexec_fn=util._on_executor_exit('SIGTERM'))

        tb_pid = tb_proc.pid

        host = socket.getfqdn()
        global tb_url
        tb_url = "http://{0}:{1}".format(host, tb_port)
        global endpoint
        endpoint = endpoint_dir + "/TensorBoard.task" + str(exec_num)

        # dump tb host:port to hdfs
        hopshdfs.dump(tb_url, endpoint)

    return endpoint, tb_pid
def _wrapper_fun(iter):
    """
    Wraps the user supplied training function in order to be passed to the Spark Executors.

    Args:
        iter: iterator handed to the Spark mapPartitions call (unused)

    Returns:
        None
    """
    experiment_utils._set_ml_id(app_id, run_id)

    # get task context information to determine executor identifier
    partition_id, task_attempt = util.get_partition_attempt_id()

    client = rpc.Client(server_addr, partition_id, task_attempt, hb_interval, secret)
    log_file = (log_dir + "/executor_" + str(partition_id) + "_" +
                str(task_attempt) + ".log")

    # save the builtin print
    original_print = __builtin__.print

    reporter = Reporter(log_file, partition_id, task_attempt, original_print)

    def maggy_print(*args, **kwargs):
        """Maggy custom print() function."""
        original_print(*args, **kwargs)
        reporter.log(" ".join(str(x) for x in args), True)

    # override the builtin print
    __builtin__.print = maggy_print

    try:
        client_addr = client.client_addr
        host_port = client_addr[0] + ":" + str(client_addr[1])

        exec_spec = {}
        exec_spec["partition_id"] = partition_id
        exec_spec["task_attempt"] = task_attempt
        exec_spec["host_port"] = host_port
        exec_spec["trial_id"] = None

        reporter.log("Registering with experiment driver", False)
        client.register(exec_spec)

        client.start_heartbeat(reporter)

        # blocking
        trial_id, parameters = client.get_suggestion(reporter)

        while not client.done:
            if experiment_type == "ablation":
                ablation_params = {
                    "ablated_feature": parameters.get("ablated_feature", "None"),
                    "ablated_layer": parameters.get("ablated_layer", "None"),
                }
                parameters.pop("ablated_feature")
                parameters.pop("ablated_layer")

            tb_logdir = log_dir + "/" + trial_id
            trial_log_file = tb_logdir + "/output.log"
            reporter.set_trial_id(trial_id)

            # If trial is repeated, delete trial directory, except log file
            if hopshdfs.exists(tb_logdir):
                util._clean_dir(tb_logdir, [trial_log_file])
            else:
                hopshdfs.mkdir(tb_logdir)

            reporter.init_logger(trial_log_file)
            tensorboard._register(tb_logdir)
            if experiment_type == "ablation":
                hopshdfs.dump(
                    json.dumps(ablation_params, default=util.json_default_numpy),
                    tb_logdir + "/.hparams.json",
                )
            else:
                hopshdfs.dump(
                    json.dumps(parameters, default=util.json_default_numpy),
                    tb_logdir + "/.hparams.json",
                )

            try:
                reporter.log("Starting Trial: {}".format(trial_id), False)
                reporter.log("Trial Configuration: {}".format(parameters), False)

                if experiment_type == "optimization":
                    tensorboard._write_hparams(parameters, trial_id)

                sig = inspect.signature(map_fun)
                if sig.parameters.get("reporter", None):
                    retval = map_fun(**parameters, reporter=reporter)
                else:
                    retval = map_fun(**parameters)

                if experiment_type == "optimization":
                    tensorboard._write_session_end()

                retval = util._handle_return_val(retval, tb_logdir,
                                                 optimization_key, trial_log_file)
            except exceptions.EarlyStopException as e:
                retval = e.metric
                reporter.log("Early Stopped Trial.", False)

            reporter.log("Finished Trial: {}".format(trial_id), False)
            reporter.log("Final Metric: {}".format(retval), False)
            client.finalize_metric(retval, reporter)

            # blocking
            trial_id, parameters = client.get_suggestion(reporter)
    except:  # noqa: E722
        reporter.log(traceback.format_exc(), False)
        raise
    finally:
        reporter.close_logger()
        client.stop()
        client.close()
def __init__(self, experiment_type, **kwargs):

    global driver_secret

    # COMMON EXPERIMENT SETUP
    self._final_store = []
    self._trial_store = {}
    self.num_executors = kwargs.get("num_executors")
    self._message_q = queue.Queue()
    self.name = kwargs.get("name")
    self.experiment_done = False
    self.worker_done = False
    self.hb_interval = kwargs.get("hb_interval")
    self.description = kwargs.get("description")
    self.experiment_type = experiment_type
    self.es_interval = kwargs.get("es_interval")
    self.es_min = kwargs.get("es_min")

    # TYPE-SPECIFIC EXPERIMENT SETUP
    if self.experiment_type == "optimization":
        # set up an optimization experiment
        self.num_trials = kwargs.get("num_trials", 1)

        searchspace = kwargs.get("searchspace")
        if isinstance(searchspace, Searchspace):
            self.searchspace = searchspace
        elif searchspace is None:
            self.searchspace = Searchspace()
        else:
            raise Exception(
                "The experiment's search space should be an instance of maggy.Searchspace, "
                "but it is {0} (of type '{1}').".format(
                    str(searchspace), type(searchspace).__name__))

        optimizer = kwargs.get("optimizer")
        if optimizer is None:
            if len(self.searchspace.names()) == 0:
                self.optimizer = SingleRun()
            else:
                raise Exception(
                    "Searchspace has to be empty or None to use without optimizer.")
        elif isinstance(optimizer, str):
            if optimizer.lower() == "randomsearch":
                self.optimizer = RandomSearch()
            elif optimizer.lower() == "asha":
                self.optimizer = Asha()
            elif optimizer.lower() == "none":
                if len(self.searchspace.names()) == 0:
                    self.optimizer = SingleRun()
                else:
                    raise Exception(
                        "Searchspace has to be empty or None to use without Optimizer.")
            else:
                raise Exception(
                    "Unknown Optimizer. Can't initialize experiment driver.")
        elif isinstance(optimizer, AbstractOptimizer):
            self.optimizer = optimizer
            print("Custom Optimizer initialized.")
        else:
            raise Exception(
                "The experiment's optimizer should either be a string indicating the name "
                "of an implemented optimizer (such as 'randomsearch') or an instance of "
                "maggy.optimizer.AbstractOptimizer, "
                "but it is {0} (of type '{1}').".format(
                    str(optimizer), type(optimizer).__name__))

        direction = kwargs.get("direction", "max")
        if isinstance(direction, str) and direction.lower() in ["min", "max"]:
            self.direction = direction.lower()
        else:
            raise Exception(
                "The experiment's direction should be a string (either 'min' or 'max') "
                "but it is {0} (of type '{1}').".format(
                    str(direction), type(direction).__name__))

        es_policy = kwargs.get("es_policy")
        if isinstance(es_policy, str):
            if es_policy.lower() == "median":
                self.earlystop_check = MedianStoppingRule.earlystop_check
            elif es_policy.lower() == "none":
                self.earlystop_check = NoStoppingRule.earlystop_check
            else:
                raise Exception(
                    "The experiment's early stopping policy should either be a string "
                    "('median' or 'none') or a custom policy that is an instance of "
                    "maggy.earlystop.AbstractEarlyStop, "
                    "but it is {0} (of type '{1}').".format(
                        str(es_policy), type(es_policy).__name__))
        elif isinstance(es_policy, AbstractEarlyStop):
            self.earlystop_check = es_policy.earlystop_check
            print("Custom Early Stopping policy initialized.")
        else:
            raise Exception(
                "The experiment's early stopping policy should either be a string "
                "('median' or 'none') or a custom policy that is an instance of "
                "maggy.earlystop.AbstractEarlyStop, "
                "but it is {0} (of type '{1}').".format(
                    str(es_policy), type(es_policy).__name__))

        self.es_interval = kwargs.get("es_interval")
        self.es_min = kwargs.get("es_min")

        self.result = {"best_val": "n.a.", "num_trials": 0, "early_stopped": 0}

    elif self.experiment_type == "ablation":
        # set up an ablation study experiment
        self.earlystop_check = NoStoppingRule.earlystop_check

        ablation_study = kwargs.get("ablation_study")
        if isinstance(ablation_study, AblationStudy):
            self.ablation_study = ablation_study
        else:
            raise Exception(
                "The experiment's ablation study configuration should be an instance of "
                "maggy.ablation.AblationStudy, "
                "but it is {0} (of type '{1}').".format(
                    str(ablation_study), type(ablation_study).__name__))

        searchspace = kwargs.get("searchspace")
        if not searchspace:
            self.searchspace = Searchspace()
        else:
            raise Exception(
                "The experiment's search space should be None for ablation experiments, "
                "but it is {0} (of type '{1}').".format(
                    str(searchspace), type(searchspace).__name__))

        ablator = kwargs.get("ablator")
        if isinstance(ablator, str):
            if ablator.lower() == "loco":
                self.ablator = LOCO(ablation_study, self._final_store)
                self.num_trials = self.ablator.get_number_of_trials()
                if self.num_executors > self.num_trials:
                    self.num_executors = self.num_trials
            else:
                raise Exception(
                    "The experiment's ablation study policy should either be a string ('loco') "
                    "or a custom policy that is an instance of "
                    "maggy.ablation.ablation.AbstractAblator, "
                    "but it is {0} (of type '{1}').".format(
                        str(ablator), type(ablator).__name__))
        elif isinstance(ablator, AbstractAblator):
            self.ablator = ablator
            print("Custom Ablator initialized. \n")
        else:
            raise Exception(
                "The experiment's ablation study policy should either be a string ('loco') "
                "or a custom policy that is an instance of "
                "maggy.ablation.ablation.AbstractAblator, "
                "but it is {0} (of type '{1}').".format(
                    str(ablator), type(ablator).__name__))

        self.result = {"best_val": "n.a.", "num_trials": 0, "early_stopped": "n.a"}

    else:
        raise Exception(
            "Unknown experiment type. experiment_type should be either 'optimization' or "
            "'ablation', but it is {0}.".format(str(self.experiment_type)))

    # FINALIZE EXPERIMENT SETUP
    self.server = rpc.Server(self.num_executors)
    if not driver_secret:
        driver_secret = self._generate_secret(ExperimentDriver.SECRET_BYTES)
    self._secret = driver_secret
    self.job_start = datetime.now()
    self.executor_logs = ""
    self.maggy_log = ""
    self.log_lock = threading.RLock()
    self.log_file = kwargs.get("log_dir") + "/maggy.log"
    self.log_dir = kwargs.get("log_dir")
    self.exception = None

    # Open File desc for HDFS to log
    if not hopshdfs.exists(self.log_file):
        hopshdfs.dump("", self.log_file)
    self.fd = hopshdfs.open_file(self.log_file, flags="w")
def _target_function(self):

    try:
        time_earlystop_check = (
            time.time()
        )  # only used by earlystop-supporting experiments

        while not self.worker_done:
            trial = None
            # get a message
            try:
                msg = self._message_q.get_nowait()
            except queue.Empty:
                msg = {"type": None}

            if self.earlystop_check != NoStoppingRule.earlystop_check:
                if (time.time() - time_earlystop_check) >= self.es_interval:
                    time_earlystop_check = time.time()

                    # pass currently running trials to early stop component
                    if len(self._final_store) > self.es_min:
                        self._log("Check for early stopping.")
                        try:
                            to_stop = self.earlystop_check(
                                self._trial_store,
                                self._final_store,
                                self.direction,
                            )
                        except Exception as e:
                            self._log(e)
                            to_stop = []
                        if len(to_stop) > 0:
                            self._log("Trials to stop: {}".format(to_stop))
                        for trial_id in to_stop:
                            self.get_trial(trial_id).set_early_stop()

            # depending on message do the work
            # 1. METRIC
            if msg["type"] == "METRIC":
                # append executor logs if in the message
                logs = msg.get("logs", None)
                if logs is not None:
                    with self.log_lock:
                        self.executor_logs = self.executor_logs + logs

                if msg["trial_id"] is not None and msg["data"] is not None:
                    self.get_trial(msg["trial_id"]).append_metric(msg["data"])

            # 2. BLACKLIST the trial
            elif msg["type"] == "BLACK":
                trial = self.get_trial(msg["trial_id"])
                with trial.lock:
                    trial.status = Trial.SCHEDULED
                    self.server.reservations.assign_trial(
                        msg["partition_id"], msg["trial_id"])

            # 3. FINAL
            elif msg["type"] == "FINAL":
                # set status
                # get trial only once
                trial = self.get_trial(msg["trial_id"])

                logs = msg.get("logs", None)
                if logs is not None:
                    with self.log_lock:
                        self.executor_logs = self.executor_logs + logs

                # finalize the trial object
                with trial.lock:
                    trial.status = Trial.FINALIZED
                    trial.final_metric = msg["data"]
                    trial.duration = experiment_utils._seconds_to_milliseconds(
                        time.time() - trial.start)

                # move trial to the finalized ones
                self._final_store.append(trial)
                self._trial_store.pop(trial.trial_id)

                # update result dictionary
                self._update_result(trial)
                # keep for later in case tqdm doesn't work
                self.maggy_log = self._update_maggy_log()
                self._log(self.maggy_log)

                hopshdfs.dump(
                    trial.to_json(),
                    self.log_dir + "/" + trial.trial_id + "/trial.json",
                )

                # assign new trial
                if self.experiment_type == "optimization":
                    trial = self.optimizer.get_suggestion(trial)
                elif self.experiment_type == "ablation":
                    trial = self.ablator.get_trial(trial)

                if trial is None:
                    self.server.reservations.assign_trial(
                        msg["partition_id"], None)
                    self.experiment_done = True
                elif trial == "IDLE":
                    self.add_message({
                        "type": "IDLE",
                        "partition_id": msg["partition_id"],
                        "idle_start": time.time(),
                    })
                    self.server.reservations.assign_trial(
                        msg["partition_id"], None)
                else:
                    with trial.lock:
                        trial.start = time.time()
                        trial.status = Trial.SCHEDULED
                        self.server.reservations.assign_trial(
                            msg["partition_id"], trial.trial_id)
                        self.add_trial(trial)

            # 4. Let executor be idle
            elif msg["type"] == "IDLE":
                # execute only every 0.1 seconds but do not block thread
                if (self.experiment_type == "optimization"
                        and time.time() - msg["idle_start"] > 0.1):
                    trial = self.optimizer.get_suggestion()
                    if trial is None:
                        self.server.reservations.assign_trial(
                            msg["partition_id"], None)
                        self.experiment_done = True
                    elif trial == "IDLE":
                        # reset timeout
                        msg["idle_start"] = time.time()
                        self.add_message(msg)
                    else:
                        with trial.lock:
                            trial.start = time.time()
                            trial.status = Trial.SCHEDULED
                            self.server.reservations.assign_trial(
                                msg["partition_id"], trial.trial_id)
                            self.add_trial(trial)
                elif self.experiment_type == "optimization":
                    self.add_message(msg)

            # 5. REG
            elif msg["type"] == "REG":
                if self.experiment_type == "optimization":
                    trial = self.optimizer.get_suggestion()
                elif self.experiment_type == "ablation":
                    trial = self.ablator.get_trial()

                if trial is None:
                    self.server.reservations.assign_trial(
                        msg["partition_id"], None)
                    self.experiment_done = True
                elif trial == "IDLE":
                    # reset timeout
                    msg["idle_start"] = time.time()
                    self.add_message(msg)
                else:
                    with trial.lock:
                        trial.start = time.time()
                        trial.status = Trial.SCHEDULED
                        self.server.reservations.assign_trial(
                            msg["partition_id"], trial.trial_id)
                        self.add_trial(trial)
    except Exception as exc:
        # Exception can't be propagated to parent thread
        # therefore log the exception and fail experiment
        self._log(exc)
        self.exception = exc
        self.server.stop()
def finalize(self, job_end):

    results = ""

    if self.experiment_type == "optimization":

        _ = self.optimizer.finalize_experiment(self._final_store)

        self.job_end = job_end

        self.duration = experiment_utils._seconds_to_milliseconds(
            self.job_end - self.job_start)

        self.duration_str = experiment_utils._time_diff(
            self.job_start, self.job_end)

        results = ("\n------ " + self.optimizer.name() + " Results ------ direction(" +
                   self.direction + ") \n"
                   "BEST combination " + json.dumps(self.result["best_hp"]) +
                   " -- metric " + str(self.result["best_val"]) + "\n"
                   "WORST combination " + json.dumps(self.result["worst_hp"]) +
                   " -- metric " + str(self.result["worst_val"]) + "\n"
                   "AVERAGE metric -- " + str(self.result["avg"]) + "\n"
                   "EARLY STOPPED Trials -- " + str(self.result["early_stopped"]) + "\n"
                   "Total job time " + self.duration_str + "\n")

    elif self.experiment_type == "ablation":

        _ = self.ablator.finalize_experiment(self._final_store)

        self.job_end = job_end

        self.duration = experiment_utils._seconds_to_milliseconds(
            self.job_end - self.job_start)

        self.duration_str = experiment_utils._time_diff(
            self.job_start, self.job_end)

        results = ("\n------ " + self.ablator.name() + " Results ------ \n" +
                   "BEST Config Excludes " + json.dumps(self.result["best_config"]) +
                   " -- metric " + str(self.result["best_val"]) + "\n" +
                   "WORST Config Excludes " + json.dumps(self.result["worst_config"]) +
                   " -- metric " + str(self.result["worst_val"]) + "\n" +
                   "AVERAGE metric -- " + str(self.result["avg"]) + "\n" +
                   "Total Job Time " + self.duration_str + "\n")

    print(results)

    self._log(results)

    hopshdfs.dump(
        json.dumps(self.result, default=util.json_default_numpy),
        self.log_dir + "/result.json",
    )

    sc = hopsutil._find_spark().sparkContext
    hopshdfs.dump(self.json(sc), self.log_dir + "/maggy.json")

    return self.result