Example #1
def _finalize_experiment(
    experiment_json,
    metric,
    app_id,
    run_id,
    state,
    duration,
    logdir,
    best_logdir,
    optimization_key,
):
    """Attaches the experiment outcome as xattr metadata to the app directory.
    """
    outputs = _build_summary_json(logdir)

    if outputs:
        hopshdfs.dump(outputs, logdir + "/.summary.json")

    if best_logdir:
        experiment_json["bestDir"] = best_logdir[len(hopshdfs.project_path()):]
    experiment_json["optimizationKey"] = optimization_key
    experiment_json["metric"] = metric
    experiment_json["state"] = state
    experiment_json["duration"] = duration

    experiment_utils._attach_experiment_xattr(app_id, run_id, experiment_json,
                                              "REPLACE")
Example #2
def _handle_return_simple(retval, hdfs_exec_logdir, logfile):
    """

    Args:
        val:
        hdfs_exec_logdir:

    Returns:

    """
    return_file = hdfs_exec_logdir + '/.outputs.json'

    if not retval:
        if logfile is not None:
            retval = {'log': logfile}
            hdfs.dump(dumps(retval), return_file)
        return

    _upload_file_output(retval, hdfs_exec_logdir)

    # Wrap a bare return value in a dict under the 'metric' key
    if not isinstance(retval, dict):
        retval = {'metric': retval}

    retval['log'] = hdfs_exec_logdir.replace(hdfs.project_path(),
                                             '') + '/output.log'

    hdfs.dump(dumps(retval), return_file)
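
The resulting .outputs.json for a scalar return value can be reproduced locally; the path below is illustrative, and dumps() is assumed to come from the json module.

from json import dumps

retval = 0.87
wrapped = retval if isinstance(retval, dict) else {'metric': retval}
wrapped['log'] = 'Experiments/app_1_1/output.log'
print(dumps(wrapped))  # {"metric": 0.87, "log": "Experiments/app_1_1/output.log"}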
Example #3
    def write_featureframe(self):
        """
        Writes a dataframe of data as a training dataset on HDFS in the hdf5 format

        Returns:
            None

        Raises:
            :ValueError: if the user supplied a write mode that is not supported
            :HDF5DatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to write
                an external training dataset in the .hdf5 format.
        """
        if self.training_dataset.training_dataset_type == constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            raise HDF5DatasetFormatNotSupportedForExternalTrainingDatasets("The .hdf5 dataset format is not "
                                                                           "supported for external training datasets.")
        if self.write_mode == constants.SPARK_CONFIG.SPARK_APPEND_MODE:
            raise ValueError(
                "Append is not supported for training datasets stored in .hdf5 format, only overwrite; "
                "set the optional argument write_mode='overwrite'")
        # Convert the input to a numpy ndarray before serializing to hdf5
        if isinstance(self.df, np.ndarray):
            df = self.df
        elif isinstance(self.df, (DataFrame, RDD)):
            df = np.array(self.df.collect())
        elif isinstance(self.df, pd.DataFrame):
            df = self.df.values
        elif isinstance(self.df, list):
            df = np.array(self.df)
        tf = TemporaryFile()
        hdf5_file = h5py.File(tf, "w")
        hdf5_file.create_dataset(self.training_dataset.name, data=df)
        hdf5_file.close()
        tf.seek(0)
        hdfs.dump(tf.read(), self.path + constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX)
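
The hdf5 round trip can be sketched locally without HDFS; this assumes h5py >= 2.9, which accepts Python file-like objects.

import io

import h5py
import numpy as np

# serialize an ndarray into an in-memory hdf5 file, then read it back
buf = io.BytesIO()
with h5py.File(buf, "w") as f:
    f.create_dataset("my_training_dataset", data=np.arange(6).reshape(2, 3))
buf.seek(0)
with h5py.File(buf, "r") as f:
    print(f["my_training_dataset"][:])  # [[0 1 2] [3 4 5]]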
Example #4
    def init_logger(self, trial_log_file):
        """Initializes the trial log file."""
        self.trial_log_file = trial_log_file
        # Open trial log file descriptor
        if not hopshdfs.exists(self.trial_log_file):
            hopshdfs.dump("", self.trial_log_file)
        self.trial_fd = hopshdfs.open_file(self.trial_log_file, flags="w")
Example #5
def _create_experiment_subdirectories(app_id,
                                      run_id,
                                      param_string,
                                      type,
                                      sub_type=None,
                                      params=None):
    """
    Creates directories for an experiment. If the Experiments folder exists, the directories
    are created below it; otherwise they are created in the Logs directory.

    Args:
        :app_id: YARN application ID of the experiment
        :run_id: Experiment ID
        :param_string: name of the new directory created under parent directories
        :type: type of the new directory parent, e.g. differential_evolution
        :sub_type: type of subdirectory under the parent, e.g. generation
        :params: dict of hyperparameters

    Returns:
        The new directories for the execution and the yarn application (hdfs_exec_logdir, hdfs_experiment_dir)
    """

    pyhdfs_handle = hdfs.get()

    hdfs_events_parent_dir = hdfs.project_path() + "Experiments"

    hdfs_experiment_dir = hdfs_events_parent_dir + "/" + app_id + "_" + str(
        run_id)

    # determine directory structure based on arguments
    if sub_type:
        hdfs_exec_logdir = hdfs_experiment_dir + "/" + str(
            sub_type) + '/' + str(param_string)
        if pyhdfs_handle.exists(hdfs_exec_logdir):
            hdfs.delete(hdfs_exec_logdir, recursive=True)
    elif not param_string and not sub_type:
        if pyhdfs_handle.exists(hdfs_experiment_dir):
            hdfs.delete(hdfs_experiment_dir, recursive=True)
        hdfs_exec_logdir = hdfs_experiment_dir + '/'
    else:
        hdfs_exec_logdir = hdfs_experiment_dir + '/' + str(param_string)
        if pyhdfs_handle.exists(hdfs_exec_logdir):
            hdfs.delete(hdfs_exec_logdir, recursive=True)

    # Create the new directory (any pre-existing one was removed above,
    # e.g. when a task is retried)
    pyhdfs_handle.create_directory(hdfs_exec_logdir)

    return_file = hdfs_exec_logdir + '/.hparams.json'
    hdfs.dump(dumps(params), return_file)

    return hdfs_exec_logdir, hdfs_experiment_dir
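
The directory scheme can be condensed into a pure-path sketch (no HDFS calls); the helper name and example values below are hypothetical.

def _example_logdir(project_path, app_id, run_id, param_string, sub_type=None):
    # mirrors the branching in _create_experiment_subdirectories above
    experiment_dir = project_path + "Experiments/" + app_id + "_" + str(run_id)
    if sub_type:
        return experiment_dir + "/" + str(sub_type) + "/" + str(param_string)
    if not param_string:
        return experiment_dir + "/"
    return experiment_dir + "/" + str(param_string)

print(_example_logdir("hdfs:///Projects/demo/", "app_1", 1, "lr=0.01", "generation.0"))
# hdfs:///Projects/demo/Experiments/app_1_1/generation.0/lr=0.01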
Example #6
    def initialize_logger(self, exp_dir):
        """Initialize logger of optimizer

        :param exp_dir: path of experiment directory
        :rtype exp_dir: str
        """

        # configure logger
        self.log_file = exp_dir + "/pruner.log"
        if not hdfs.exists(self.log_file):
            hdfs.dump("", self.log_file)
        self.fd = hdfs.open_file(self.log_file, flags="w")
        self._log("Initialized Pruner Logger")
Example #7
def _handle_return(retval, hdfs_exec_logdir, optimization_key, logfile):
    """

    Args:
        val:
        hdfs_exec_logdir:

    Returns:

    """

    _upload_file_output(retval, hdfs_exec_logdir)

    # Validation
    if not optimization_key and type(retval) is dict and len(
            retval.keys()) > 1:
        raise Exception(
            'Missing optimization_key argument, when returning multiple values in a dict the optimization_key argument must be set.'
        )
    elif type(retval) is dict and optimization_key not in retval and len(
            retval.keys()) >= 1:
        raise Exception(
            'optimization_key not in returned dict, when returning multiple values in a dict the optimization_key argument must be set to indicate which key the optimization algorithm should maximize or minimize on.'
        )
    elif type(retval) is dict and len(retval.keys()) == 0:
        raise Exception(
            'Returned dict is empty, must contain at least 1 metric to maximize or minimize.'
        )

    # Validate that optimization_key is a number
    if type(retval) is dict and len(retval.keys()) > 1:
        opt_val = retval[optimization_key]
        opt_val = _validate_optimization_value(opt_val)
        retval[optimization_key] = opt_val
    elif type(retval) is dict and len(retval.keys()) == 1:
        opt_val = retval[list(retval.keys())[0]]
        opt_val = _validate_optimization_value(opt_val)
        retval[list(retval.keys())[0]] = opt_val
    else:
        opt_val = _validate_optimization_value(retval)
        retval = {'metric': opt_val}

    retval['log'] = logfile

    return_file = hdfs_exec_logdir + '/.outputs.json'
    hdfs.dump(dumps(retval), return_file)

    metric_file = hdfs_exec_logdir + '/.metric'
    hdfs.dump(str(opt_val), metric_file)
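
To make the validation rules concrete, here is how a few illustrative return values would be treated, assuming optimization_key='accuracy':

#   0.85                             -> wrapped as {'metric': 0.85}
#   {'accuracy': 0.91}               -> accepted (single key)
#   {'accuracy': 0.91, 'loss': 0.2}  -> accepted, optimized on 'accuracy'
#   {'loss': 0.2, 'time': 12}       -> raises: optimization_key not in dict
#   {}                               -> raises: empty dict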
Example #8
def _store_tf_record_schema_hdfs(tfrecord_schema, hdfs_path):
    """
    Stores a tfrecord json schema to HDFS

    Args:
        :tfrecord_schema: the tfrecord schema to store
        :hdfs_path: the hdfs path to store it

    Returns:
        None
    """
    json_str = json.dumps(tfrecord_schema)
    hdfs.dump(
        json_str, hdfs_path + constants.DELIMITERS.SLASH_DELIMITER +
        constants.FEATURE_STORE.TRAINING_DATASET_TF_RECORD_SCHEMA_FILE_NAME)
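
A hypothetical call might look as follows; the schema dict and the path are illustrative only, not the library's actual schema format.

schema = {"label": {"feature_type": "int"}, "features": {"feature_type": "array"}}
_store_tf_record_schema_hdfs(schema, hdfs.project_path() + "Training_Datasets/td_1")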
Example #9
def _finalize_experiment(experiment_json, metric, app_id, run_id, state,
                         duration, logdir, bestLogdir, optimization_key):

    summary_file = _build_summary_json(logdir)

    if summary_file:
        hdfs.dump(summary_file, logdir + '/.summary.json')

    if bestLogdir:
        experiment_json['bestDir'] = bestLogdir[len(hdfs.project_path()):]
    experiment_json['optimizationKey'] = optimization_key
    experiment_json['metric'] = metric
    experiment_json['state'] = state
    experiment_json['duration'] = duration

    _attach_experiment_xattr(app_id, run_id, experiment_json, 'REPLACE')
Example #10
    def __init__(self, log_file, partition_id, task_attempt, print_executor):
        self.metric = None
        self.lock = threading.RLock()
        self.stop = False
        self.trial_id = None
        self.trial_log_file = None
        self.logs = ""
        self.log_file = log_file
        self.partition_id = partition_id
        self.task_attempt = task_attempt
        self.print_executor = print_executor

        # Open executor log file descriptor
        # This log is for all maggy system related log messages
        if not hopshdfs.exists(log_file):
            hopshdfs.dump("", log_file)
        self.fd = hopshdfs.open_file(log_file, flags="w")
        self.trial_fd = None
Example #11
def _handle_return_val(return_val, log_dir, optimization_key, log_file):
    """Handles the return value of the user defined training function.
    """
    experiment_utils._upload_file_output(return_val, log_dir)

    # Return type validation
    if not optimization_key:
        raise ValueError("Optimization key cannot be None.")
    if not return_val:
        raise exceptions.ReturnTypeError(optimization_key, return_val)
    if not isinstance(return_val, constants.USER_FCT.RETURN_TYPES):
        raise exceptions.ReturnTypeError(optimization_key, return_val)
    if isinstance(return_val, dict) and optimization_key not in return_val:
        raise KeyError(
            "Returned dictionary does not contain optimization key with the "
            "provided name: {}".format(optimization_key))

    # validate that optimization metric is numeric
    if isinstance(return_val, dict):
        opt_val = return_val[optimization_key]
    else:
        opt_val = return_val
        return_val = {optimization_key: opt_val}

    if not isinstance(opt_val, constants.USER_FCT.NUMERIC_TYPES):
        raise exceptions.MetricTypeError(optimization_key, opt_val)

    # for key, value in return_val.items():
    #    return_val[key] = value if isinstance(value, str) else str(value)

    return_val["log"] = log_file.replace(hopshdfs.project_path(), "")

    return_file = log_dir + "/.outputs.json"
    hopshdfs.dump(json.dumps(return_val, default=json_default_numpy),
                  return_file)

    metric_file = log_dir + "/.metric"
    hopshdfs.dump(json.dumps(opt_val, default=json_default_numpy), metric_file)

    return opt_val
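
The snippets above pass json_default_numpy as the default= hook of json.dumps. A plausible implementation (an assumption, not the library's source) converts numpy scalars and arrays to native Python types:

import json

import numpy as np

def json_default_numpy(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError("Object of type {} is not JSON serializable".format(type(obj)))

print(json.dumps({"metric": np.int64(3)}, default=json_default_numpy))  # {"metric": 3}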
Example #12

def _handle_return_simple(retval, hdfs_exec_logdir, logfile):
    """Dumps the return value of a simple experiment to `.outputs.json`.

    Args:
        :retval: return value of the user-defined training function
        :hdfs_exec_logdir: HDFS directory of the experiment execution
        :logfile: path of the experiment log file

    Returns:
        None
    """
    return_file = hdfs_exec_logdir + '/.outputs.json'

    _upload_file_output(retval, hdfs_exec_logdir)

    # Wrap a bare return value in a dict under the 'metric' key
    if not isinstance(retval, dict):
        retval = {'metric': retval}

    retval['log'] = logfile

    hdfs.dump(dumps(retval), return_file)
Example #13
    def dump(self, data, hdfs_path):
        return hopshdfs.dump(data, hdfs_path)
Example #14

def _register(hdfs_exec_dir, endpoint_dir, exec_num, local_logdir=False):
    """Starts TensorBoard on the executor and writes its endpoint to HDFS.

    Args:
        :hdfs_exec_dir: HDFS directory containing the TensorBoard event files
        :endpoint_dir: HDFS directory where the TensorBoard endpoint file is written
        :exec_num: executor number, used to name the endpoint file
        :local_logdir: whether TensorBoard should read from a local logdir instead of HDFS

    Returns:
        The endpoint file path and the TensorBoard process id
    """
    global tb_pid

    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])

    _reset_global()

    global events_logdir
    events_logdir = hdfs_exec_dir

    global local_logdir_bool
    local_logdir_bool = local_logdir

    if tb_pid == 0:
        global pypath
        pypath = os.getenv("PYSPARK_PYTHON")

        # find a free port for TensorBoard
        tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tb_socket.bind(('', 0))
        global tb_port
        tb_addr, tb_port = tb_socket.getsockname()

        global tb_path
        tb_path = experiment_utils._find_tensorboard()

        tb_socket.close()

        tb_env = _init_tb_env()

        global local_logdir_path
        if local_logdir:
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
            os.makedirs(local_logdir_path)

            local_logdir_path = local_logdir_path + '/'
            tb_proc = subprocess.Popen(
                [
                    pypath, tb_path,
                    "--logdir=%s" % local_logdir_path,
                    "--port=%d" % tb_port,
                    "--host=%s" % "0.0.0.0"
                ],
                env=tb_env,
                preexec_fn=util._on_executor_exit('SIGTERM'))
        else:
            tb_proc = subprocess.Popen(
                [
                    pypath, tb_path,
                    "--logdir=%s" % events_logdir,
                    "--port=%d" % tb_port,
                    "--host=%s" % "0.0.0.0"
                ],
                env=tb_env,
                preexec_fn=util._on_executor_exit('SIGTERM'))

        tb_pid = tb_proc.pid

        host = socket.getfqdn()
        global tb_url
        tb_url = "http://{0}:{1}".format(host, tb_port)
        global endpoint
        endpoint = endpoint_dir + "/TensorBoard.task" + str(exec_num)

    # dump the TensorBoard host:port endpoint to HDFS
    hopshdfs.dump(tb_url, endpoint)

    return endpoint, tb_pid
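
The free-port trick in _register stands on its own: bind to port 0, let the OS assign an ephemeral port, then release it for TensorBoard to claim. A minimal sketch (note the small race window between close() and reuse):

import socket

tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
tb_socket.bind(("", 0))
tb_addr, tb_port = tb_socket.getsockname()
tb_socket.close()
print(tb_port)  # an OS-assigned free port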
Example #15
    def _wrapper_fun(iter):
        """
        Wraps the user supplied training function in order to be passed to the
        Spark Executors.

        Args:
            iter:

        Returns:

        """
        experiment_utils._set_ml_id(app_id, run_id)

        # get task context information to determine executor identifier
        partition_id, task_attempt = util.get_partition_attempt_id()

        client = rpc.Client(server_addr, partition_id, task_attempt,
                            hb_interval, secret)
        log_file = (log_dir + "/executor_" + str(partition_id) + "_" +
                    str(task_attempt) + ".log")

        # save the builtin print
        original_print = __builtin__.print

        reporter = Reporter(log_file, partition_id, task_attempt,
                            original_print)

        def maggy_print(*args, **kwargs):
            """Maggy custom print() function."""
            original_print(*args, **kwargs)
            reporter.log(" ".join(str(x) for x in args), True)

        # override the builtin print
        __builtin__.print = maggy_print

        try:
            client_addr = client.client_addr

            host_port = client_addr[0] + ":" + str(client_addr[1])

            exec_spec = {}
            exec_spec["partition_id"] = partition_id
            exec_spec["task_attempt"] = task_attempt
            exec_spec["host_port"] = host_port
            exec_spec["trial_id"] = None

            reporter.log("Registering with experiment driver", False)
            client.register(exec_spec)

            client.start_heartbeat(reporter)

            # blocking
            trial_id, parameters = client.get_suggestion(reporter)

            while not client.done:
                if experiment_type == "ablation":
                    ablation_params = {
                        "ablated_feature":
                        parameters.get("ablated_feature", "None"),
                        "ablated_layer":
                        parameters.get("ablated_layer", "None"),
                    }
                    parameters.pop("ablated_feature")
                    parameters.pop("ablated_layer")

                tb_logdir = log_dir + "/" + trial_id
                trial_log_file = tb_logdir + "/output.log"
                reporter.set_trial_id(trial_id)

                # If trial is repeated, delete trial directory, except log file
                if hopshdfs.exists(tb_logdir):
                    util._clean_dir(tb_logdir, [trial_log_file])
                else:
                    hopshdfs.mkdir(tb_logdir)

                reporter.init_logger(trial_log_file)
                tensorboard._register(tb_logdir)
                if experiment_type == "ablation":
                    hopshdfs.dump(
                        json.dumps(ablation_params,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                else:
                    hopshdfs.dump(
                        json.dumps(parameters,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                try:
                    reporter.log("Starting Trial: {}".format(trial_id), False)
                    reporter.log("Trial Configuration: {}".format(parameters),
                                 False)

                    if experiment_type == "optimization":
                        tensorboard._write_hparams(parameters, trial_id)

                    sig = inspect.signature(map_fun)
                    if sig.parameters.get("reporter", None):
                        retval = map_fun(**parameters, reporter=reporter)
                    else:
                        retval = map_fun(**parameters)

                    if experiment_type == "optimization":
                        tensorboard._write_session_end()

                    retval = util._handle_return_val(retval, tb_logdir,
                                                     optimization_key,
                                                     trial_log_file)

                except exceptions.EarlyStopException as e:
                    retval = e.metric
                    reporter.log("Early Stopped Trial.", False)

                reporter.log("Finished Trial: {}".format(trial_id), False)
                reporter.log("Final Metric: {}".format(retval), False)
                client.finalize_metric(retval, reporter)

                # blocking
                trial_id, parameters = client.get_suggestion(reporter)

        except:  # noqa: E722
            reporter.log(traceback.format_exc(), False)
            raise
        finally:
            reporter.close_logger()
            client.stop()
            client.close()
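
The print-override pattern in _wrapper_fun can be isolated into a minimal sketch using the Python 3 builtins module: keep a reference to the builtin, install a wrapper, and restore it afterwards. The tee_print name is hypothetical.

import builtins

original_print = builtins.print

def tee_print(*args, **kwargs):
    original_print(*args, **kwargs)
    # a real Reporter would also persist the line here, e.g. reporter.log(...)

builtins.print = tee_print
print("hello from the wrapper")  # routed through tee_print
builtins.print = original_print  # restore the builtin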
Example #16
    def __init__(self, experiment_type, **kwargs):

        global driver_secret

        # COMMON EXPERIMENT SETUP
        self._final_store = []
        self._trial_store = {}
        self.num_executors = kwargs.get("num_executors")
        self._message_q = queue.Queue()
        self.name = kwargs.get("name")
        self.experiment_done = False
        self.worker_done = False
        self.hb_interval = kwargs.get("hb_interval")
        self.description = kwargs.get("description")
        self.experiment_type = experiment_type
        self.es_interval = kwargs.get("es_interval")
        self.es_min = kwargs.get("es_min")

        # TYPE-SPECIFIC EXPERIMENT SETUP
        if self.experiment_type == "optimization":
            # set up an optimization experiment

            self.num_trials = kwargs.get("num_trials", 1)

            searchspace = kwargs.get("searchspace")
            if isinstance(searchspace, Searchspace):
                self.searchspace = searchspace
            elif searchspace is None:
                self.searchspace = Searchspace()
            else:
                raise Exception(
                    "The experiment's search space should be an instance of maggy.Searchspace, "
                    "but it is {0} (of type '{1}').".format(
                        str(searchspace),
                        type(searchspace).__name__))

            optimizer = kwargs.get("optimizer")

            if optimizer is None:
                if len(self.searchspace.names()) == 0:
                    self.optimizer = SingleRun()
                else:
                    raise Exception(
                        "Searchspace has to be empty or None to use without optimizer"
                    )
            elif isinstance(optimizer, str):
                if optimizer.lower() == "randomsearch":
                    self.optimizer = RandomSearch()
                elif optimizer.lower() == "asha":
                    self.optimizer = Asha()
                elif optimizer.lower() == "none":
                    if len(self.searchspace.names()) == 0:
                        self.optimizer = SingleRun()
                    else:
                        raise Exception(
                            "Searchspace has to be empty or None to use without Optimizer."
                        )
                else:
                    raise Exception(
                        "Unknown Optimizer. Can't initialize experiment driver."
                    )
            elif isinstance(optimizer, AbstractOptimizer):
                self.optimizer = optimizer
                print("Custom Optimizer initialized.")
            else:
                raise Exception(
                    "The experiment's optimizer should either be an string indicating the name "
                    "of an implemented optimizer (such as 'randomsearch') or an instance of "
                    "maggy.optimizer.AbstractOptimizer, "
                    "but it is {0} (of type '{1}').".format(
                        str(optimizer),
                        type(optimizer).__name__))

            direction = kwargs.get("direction", "max")
            if isinstance(direction,
                          str) and direction.lower() in ["min", "max"]:
                self.direction = direction.lower()
            else:
                raise Exception(
                    "The experiment's direction should be an string (either 'min' or 'max') "
                    "but it is {0} (of type '{1}').".format(
                        str(direction),
                        type(direction).__name__))

            es_policy = kwargs.get("es_policy")
            if isinstance(es_policy, str):
                if es_policy.lower() == "median":
                    self.earlystop_check = MedianStoppingRule.earlystop_check
                elif es_policy.lower() == "none":
                    self.earlystop_check = NoStoppingRule.earlystop_check
                else:
                    raise Exception(
                        "The experiment's early stopping policy should either be a string ('median' or 'none') "
                        "or a custom policy that is an instance of maggy.earlystop.AbstractEarlyStop, "
                        "but it is {0} (of type '{1}').".format(
                            str(es_policy),
                            type(es_policy).__name__))
            elif isinstance(es_policy, AbstractEarlyStop):
                self.earlystop_check = es_policy.earlystop_check
                print("Custom Early Stopping policy initialized.")
            else:
                raise Exception(
                    "The experiment's early stopping policy should either be a string ('median' or 'none') "
                    "or a custom policy that is an instance of maggy.earlystop.AbstractEarlyStop, "
                    "but it is {0} (of type '{1}').".format(
                        str(es_policy),
                        type(es_policy).__name__))

            self.es_interval = kwargs.get("es_interval")
            self.es_min = kwargs.get("es_min")

            self.result = {
                "best_val": "n.a.",
                "num_trials": 0,
                "early_stopped": 0
            }

        elif self.experiment_type == "ablation":
            # set up an ablation study experiment
            self.earlystop_check = NoStoppingRule.earlystop_check

            ablation_study = kwargs.get("ablation_study")
            if isinstance(ablation_study, AblationStudy):
                self.ablation_study = ablation_study
            else:
                raise Exception(
                    "The experiment's ablation study configuration should be an instance of "
                    "maggy.ablation.AblationStudy, "
                    "but it is {0} (of type '{1}').".format(
                        str(ablation_study),
                        type(ablation_study).__name__))

            searchspace = kwargs.get("searchspace")
            if not searchspace:
                self.searchspace = Searchspace()
            else:
                raise Exception(
                    "The experiment's search space should be None for ablation experiments, "
                    "but it is {0} (of type '{1}').".format(
                        str(searchspace),
                        type(searchspace).__name__))

            ablator = kwargs.get("ablator")
            if isinstance(ablator, str):
                if ablator.lower() == "loco":
                    self.ablator = LOCO(ablation_study, self._final_store)
                    self.num_trials = self.ablator.get_number_of_trials()
                    if self.num_executors > self.num_trials:
                        self.num_executors = self.num_trials
                else:
                    raise Exception(
                        "The experiment's ablation study policy should either be a string ('loco') "
                        "or a custom policy that is an instance of maggy.ablation.ablation.AbstractAblator, "
                        "but it is {0} (of type '{1}').".format(
                            str(ablator),
                            type(ablator).__name__))
            elif isinstance(ablator, AbstractAblator):
                self.ablator = ablator
                print("Custom Ablator initialized. \n")
            else:
                raise Exception(
                    "The experiment's ablation study policy should either be a string ('loco') "
                    "or a custom policy that is an instance of maggy.ablation.ablation.AbstractAblator, "
                    "but it is {0} (of type '{1}').".format(
                        str(ablator),
                        type(ablator).__name__))

            self.result = {
                "best_val": "n.a.",
                "num_trials": 0,
                "early_stopped": "n.a"
            }
        else:
            raise Exception(
                "Unknown experiment type. experiment_type should be either 'optimization' or 'ablation', "
                "but it is {0}.".format(str(self.experiment_type)))

        # FINALIZE EXPERIMENT SETUP
        self.server = rpc.Server(self.num_executors)
        if not driver_secret:
            driver_secret = self._generate_secret(
                ExperimentDriver.SECRET_BYTES)
        self._secret = driver_secret
        self.job_start = datetime.now()
        self.executor_logs = ""
        self.maggy_log = ""
        self.log_lock = threading.RLock()
        self.log_file = kwargs.get("log_dir") + "/maggy.log"
        self.log_dir = kwargs.get("log_dir")
        self.exception = None

        # Open File desc for HDFS to log
        if not hopshdfs.exists(self.log_file):
            hopshdfs.dump("", self.log_file)
        self.fd = hopshdfs.open_file(self.log_file, flags="w")
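
The constructor relies on a _generate_secret helper. A plausible implementation (an assumption; the actual maggy helper may differ) draws the requested number of random bytes and hex-encodes them:

import secrets

def _generate_secret(nbytes):
    return secrets.token_hex(nbytes)

print(_generate_secret(8))  # e.g. 'd1f0a9c3b2e4f567'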
Example #17
        def _target_function(self):

            try:
                time_earlystop_check = (
                    time.time()
                )  # only used by earlystop-supporting experiments

                while not self.worker_done:
                    trial = None
                    # get a message
                    try:
                        msg = self._message_q.get_nowait()
                    except queue.Empty:
                        msg = {"type": None}

                    if self.earlystop_check != NoStoppingRule.earlystop_check:
                        if (time.time() -
                                time_earlystop_check) >= self.es_interval:
                            time_earlystop_check = time.time()

                            # pass currently running trials to early stop component
                            if len(self._final_store) > self.es_min:
                                self._log("Check for early stopping.")
                                try:
                                    to_stop = self.earlystop_check(
                                        self._trial_store,
                                        self._final_store,
                                        self.direction,
                                    )
                                except Exception as e:
                                    self._log(e)
                                    to_stop = []
                                if len(to_stop) > 0:
                                    self._log(
                                        "Trials to stop: {}".format(to_stop))
                                for trial_id in to_stop:
                                    self.get_trial(trial_id).set_early_stop()

                    # depending on message do the work
                    # 1. METRIC
                    if msg["type"] == "METRIC":
                        # append executor logs if in the message
                        logs = msg.get("logs", None)
                        if logs is not None:
                            with self.log_lock:
                                self.executor_logs = self.executor_logs + logs

                        if msg["trial_id"] is not None and msg[
                                "data"] is not None:
                            self.get_trial(msg["trial_id"]).append_metric(
                                msg["data"])

                    # 2. BLACKLIST the trial
                    elif msg["type"] == "BLACK":
                        trial = self.get_trial(msg["trial_id"])
                        with trial.lock:
                            trial.status = Trial.SCHEDULED
                            self.server.reservations.assign_trial(
                                msg["partition_id"], msg["trial_id"])

                    # 3. FINAL
                    elif msg["type"] == "FINAL":
                        # set status
                        # get trial only once
                        trial = self.get_trial(msg["trial_id"])

                        logs = msg.get("logs", None)
                        if logs is not None:
                            with self.log_lock:
                                self.executor_logs = self.executor_logs + logs

                        # finalize the trial object
                        with trial.lock:
                            trial.status = Trial.FINALIZED
                            trial.final_metric = msg["data"]
                            trial.duration = experiment_utils._seconds_to_milliseconds(
                                time.time() - trial.start)

                        # move trial to the finalized ones
                        self._final_store.append(trial)
                        self._trial_store.pop(trial.trial_id)

                        # update result dictionary
                        self._update_result(trial)
                        # keep for later in case tqdm doesn't work
                        self.maggy_log = self._update_maggy_log()
                        self._log(self.maggy_log)

                        hopshdfs.dump(
                            trial.to_json(),
                            self.log_dir + "/" + trial.trial_id +
                            "/trial.json",
                        )

                        # assign new trial
                        if self.experiment_type == "optimization":
                            trial = self.optimizer.get_suggestion(trial)
                        elif self.experiment_type == "ablation":
                            trial = self.ablator.get_trial(trial)
                        if trial is None:
                            self.server.reservations.assign_trial(
                                msg["partition_id"], None)
                            self.experiment_done = True
                        elif trial == "IDLE":
                            self.add_message({
                                "type":
                                "IDLE",
                                "partition_id":
                                msg["partition_id"],
                                "idle_start":
                                time.time(),
                            })
                            self.server.reservations.assign_trial(
                                msg["partition_id"], None)
                        else:
                            with trial.lock:
                                trial.start = time.time()
                                trial.status = Trial.SCHEDULED
                                self.server.reservations.assign_trial(
                                    msg["partition_id"], trial.trial_id)
                                self.add_trial(trial)

                    # 4. Let executor be idle
                    elif msg["type"] == "IDLE":
                        # execute only every 0.1 seconds but do not block thread
                        if (self.experiment_type == "optimization"
                                and time.time() - msg["idle_start"] > 0.1):
                            trial = self.optimizer.get_suggestion()
                            if trial is None:
                                self.server.reservations.assign_trial(
                                    msg["partition_id"], None)
                                self.experiment_done = True
                            elif trial == "IDLE":
                                # reset timeout
                                msg["idle_start"] = time.time()
                                self.add_message(msg)
                            else:
                                with trial.lock:
                                    trial.start = time.time()
                                    trial.status = Trial.SCHEDULED
                                    self.server.reservations.assign_trial(
                                        msg["partition_id"], trial.trial_id)
                                    self.add_trial(trial)
                        elif self.experiment_type == "optimization":
                            self.add_message(msg)

                    # 5. REG
                    elif msg["type"] == "REG":
                        if self.experiment_type == "optimization":
                            trial = self.optimizer.get_suggestion()
                        elif self.experiment_type == "ablation":
                            trial = self.ablator.get_trial()
                        if trial is None:
                            self.server.reservations.assign_trial(
                                msg["partition_id"], None)
                            self.experiment_done = True
                        elif trial == "IDLE":
                            # reset timeout
                            msg["idle_start"] = time.time()
                            self.add_message(msg)
                        else:
                            with trial.lock:
                                trial.start = time.time()
                                trial.status = Trial.SCHEDULED
                                self.server.reservations.assign_trial(
                                    msg["partition_id"], trial.trial_id)
                                self.add_trial(trial)
            except Exception as exc:
                # Exception can't be propagated to parent thread
                # therefore log the exception and fail experiment
                self._log(exc)
                self.exception = exc
                self.server.stop()
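
The non-blocking poll at the top of _target_function reduces to a small standalone sketch: drain one message if available, otherwise fall through with a typed sentinel.

import queue

q = queue.Queue()
try:
    msg = q.get_nowait()
except queue.Empty:
    msg = {"type": None}
print(msg)  # {'type': None} when the queue is empty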
Example #18
    def finalize(self, job_end):

        results = ""

        if self.experiment_type == "optimization":

            _ = self.optimizer.finalize_experiment(self._final_store)

            self.job_end = job_end

            self.duration = experiment_utils._seconds_to_milliseconds(
                self.job_end - self.job_start)

            self.duration_str = experiment_utils._time_diff(
                self.job_start, self.job_end)

            results = ("\n------ " + self.optimizer.name() +
                       " Results ------ direction(" + self.direction + ") \n"
                       "BEST combination " +
                       json.dumps(self.result["best_hp"]) + " -- metric " +
                       str(self.result["best_val"]) + "\n"
                       "WORST combination " +
                       json.dumps(self.result["worst_hp"]) + " -- metric " +
                       str(self.result["worst_val"]) + "\n"
                       "AVERAGE metric -- " + str(self.result["avg"]) + "\n"
                       "EARLY STOPPED Trials -- " +
                       str(self.result["early_stopped"]) + "\n"
                       "Total job time " + self.duration_str + "\n")

        elif self.experiment_type == "ablation":

            _ = self.ablator.finalize_experiment(self._final_store)
            self.job_end = job_end

            self.duration = experiment_utils._seconds_to_milliseconds(
                self.job_end - self.job_start)

            self.duration_str = experiment_utils._time_diff(
                self.job_start, self.job_end)

            results = ("\n------ " + self.ablator.name() +
                       " Results ------ \n" + "BEST Config Excludes " +
                       json.dumps(self.result["best_config"]) + " -- metric " +
                       str(self.result["best_val"]) + "\n" +
                       "WORST Config Excludes " +
                       json.dumps(self.result["worst_config"]) +
                       " -- metric " + str(self.result["worst_val"]) + "\n" +
                       "AVERAGE metric -- " + str(self.result["avg"]) + "\n" +
                       "Total Job Time " + self.duration_str + "\n")

        print(results)

        self._log(results)

        hopshdfs.dump(
            json.dumps(self.result, default=util.json_default_numpy),
            self.log_dir + "/result.json",
        )
        sc = hopsutil._find_spark().sparkContext
        hopshdfs.dump(self.json(sc), self.log_dir + "/maggy.json")

        return self.result