Example #1
    def read_featureframe(self, spark):
        """
        Reads a training dataset in petastorm format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
        """
        if hasattr(self, 'training_dataset') and \
            self.training_dataset.training_dataset_type != constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            if hdfs.exists(self.path):
                spark_df = spark.read.parquet(self.path)
            elif hdfs.exists(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX):
                spark_df = spark.read.parquet(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX)
            else:
                raise TrainingDatasetNotFound(
                    "Could not find a training dataset in folder {} "
                    "or in file {}".format(
                        self.path,
                        self.path +
                        constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX))
        else:
            spark_df = spark.read.parquet(self.path)
        return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
Example #2
    def read_featureframe(self, spark):
        """
        Reads a training dataset in CSV format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
        """
        if self.training_dataset.training_dataset_type != constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            if hdfs.exists(self.path):
                spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT).option(
                    constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true").option(
                    constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER,
                    constants.DELIMITERS.COMMA_DELIMITER).load(self.path)
            elif hdfs.exists(self.path + constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX):
                spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT).option(
                    constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true").option(
                    constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER,
                    constants.DELIMITERS.COMMA_DELIMITER).load(self.path +
                                                               constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX)
            else:
                raise TrainingDatasetNotFound(
                    "Could not find a training dataset in folder {} or in file {}".format(
                        self.path,
                        self.path + constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX))
        else:
            spark_df = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT).option(
                constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true").option(
                constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER,
                constants.DELIMITERS.COMMA_DELIMITER).load(self.path)
        return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
Example #3
    def read_featureframe(self, spark):
        """
        Reads a training dataset in tfrecords format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
        """
        if hasattr(self, 'training_dataset') and self.training_dataset.training_dataset_type != \
                constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            if hdfs.exists(self.path):
                spark_df = spark.read.format(
                    constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT
                ).option(
                    constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                    constants.SPARK_CONFIG.
                    SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE).load(self.path)
            elif hdfs.exists(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX):
                spark_df = spark.read.format(
                    constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT
                ).option(
                    constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                    constants.SPARK_CONFIG.
                    SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE
                ).load(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX)
            else:
                raise TrainingDatasetNotFound(
                    "Could not find a training dataset in folder {} or in file {}".format(
                        self.path,
                        self.path +
                        constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX))
        else:
            spark_df = spark.read.format(
                constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT
            ).option(
                constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE
            ).load(self.path)
        return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
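The three readers above share the same lookup logic: try the dataset folder first, fall back to the file with the format suffix, and raise if neither exists. A minimal sketch of that pattern as a standalone helper, assuming the hops-style hdfs module and the TrainingDatasetNotFound exception used above (the helper name is illustrative):

def _resolve_dataset_path(path, suffix):
    # Prefer the dataset folder, then fall back to the single suffixed file.
    for candidate in (path, path + suffix):
        if hdfs.exists(candidate):
            return candidate
    raise TrainingDatasetNotFound(
        "Could not find a training dataset in folder {} or in file {}".format(
            path, path + suffix))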
Example #4
def _version_resources(versioned_resources, rundir):
    """

    Args:
        versioned_resources:
        rundir:

    Returns:

    """
    if not versioned_resources:
        return None
    pyhdfs_handle = hdfs.get()
    pyhdfs_handle.create_directory(rundir)
    endpoint_prefix = hdfs.project_path()
    versioned_paths = []
    for hdfs_resource in versioned_resources:
        if pydoop.hdfs.path.exists(hdfs_resource):
            log("Versoning resource '%s' in rundir '%s'" %
                (hdfs_resource, rundir))

            # Remove the file if it exists
            target_path = os.path.join(rundir, os.path.basename(hdfs_resource))
            if hdfs.exists(target_path):
                hdfs.rmr(target_path)

            hdfs.cp(hdfs_resource, rundir)
            filename = os.path.basename(hdfs_resource)
            versioned_paths.append(
                rundir.replace(endpoint_prefix, '') + '/' + filename)
        else:
            log("Resource not found '%s'" % hdfs_resource, level='warning')
            #raise Exception('Could not find resource in specified path: ' + hdfs_resource)

    return ', '.join(versioned_paths)
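The copy step above uses a replace-on-copy idiom: delete any stale target before copying. A minimal sketch of that idiom on its own, assuming the hops hdfs helpers (exists, rmr, cp) shown in this example:

import os

def _copy_replacing(src, dest_dir):
    # Remove a stale copy of the resource, if any, before copying the new one.
    target = os.path.join(dest_dir, os.path.basename(src))
    if hdfs.exists(target):
        hdfs.rmr(target)
    hdfs.cp(src, dest_dir)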
Example #5
def _validate_user_serving_input(model_path, model_name, serving_type,
                                 model_version, batching_enabled,
                                 num_partitions, num_replicas, instances):
    """
    Validate user input on the client side before sending REST call to Hopsworks (additional validation will be done
    in the backend)

    Args:
        :model_path: path to the model or artifact being served
        :model_name: the name of the serving to create
        :serving_type: the type of serving
        :model_version: version of the serving
        :batching_enabled: boolean flag whether to enable batching for inference requests to the serving
        :num_partitions: kafka partitions
        :num_replicas: kafka replicas
        :instances: the number of serving instances (the more instances the more inference requests can
                    be served in parallel)

    Returns:
        None

    Raises:
        :ValueError: if the serving input failed the validation
    """
    name_pattern = re.compile("^[a-zA-Z0-9]+$")
    if len(model_name) > 256 or model_name == "" or not name_pattern.match(
            model_name):
        raise ValueError(
            "Name of serving cannot be empty, cannot exceed 256 characters and must match the regular "
            "expression: ^[a-zA-Z0-9]+$, the provided name: {} is not valid".
            format(model_name))
    if not hdfs.exists(model_path):
        raise ValueError(
            "The model/artifact path must exist in HDFS, the provided path: {} "
            "does not exist".format(model_path))
    if serving_type not in constants.MODEL_SERVING.SERVING_TYPES:
        raise ValueError(
            "The provided serving_type: {} is not supported, supported "
            "serving types are: {}".format(
                serving_type, ",".join(constants.MODEL_SERVING.SERVING_TYPES)))
    if not isinstance(model_version, int):
        raise ValueError(
            "The model version must be an integer, the provided version is not: {}"
            .format(model_version))
    if serving_type == constants.MODEL_SERVING.SERVING_TYPE_TENSORFLOW:
        if not isinstance(num_replicas, int):
            raise ValueError(
                "Number of kafka topic replicas must be an integer, the provided num replicas "
                "is not: {}".format(model_version))
        if not isinstance(num_partitions, int):
            raise ValueError(
                "Number of kafka topic partitions must be an integer, the provided num partitions "
                "is not: {}".format(num_partitions))
        if not isinstance(batching_enabled, bool):
            raise ValueError(
                "Batching enabled must be a boolean, the provided value "
                "is not: {}".format(batching_enabled))
    if not isinstance(instances, int):
        raise ValueError("The number of serving instances must be an integer, "
                         "the provided version is not: {}".format(instances))
Example #6
def _create_experiment_dir(app_id, run_id):
    experiment_path = _get_logdir(app_id, run_id)

    if hdfs.exists(experiment_path):
        hdfs.delete(experiment_path, recursive=True)

    hdfs.mkdir(experiment_path)
Example #7
    def init_logger(self, trial_log_file):
        """Initializes the trial log file
        """
        self.trial_log_file = trial_log_file
        # Open trial log file descriptor
        if not hopshdfs.exists(self.trial_log_file):
            hopshdfs.dump("", self.trial_log_file)
        self.trial_fd = hopshdfs.open_file(self.trial_log_file, flags="w")
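The create-if-missing-then-open idiom above reappears in Examples 13, 14 and 22. A minimal sketch of it as a reusable helper, assuming the hops hdfs API (exists, dump, open_file) used in these examples:

def _open_hdfs_log(path):
    # dump("") creates an empty file so that open_file() has something to open.
    if not hopshdfs.exists(path):
        hopshdfs.dump("", path)
    return hopshdfs.open_file(path, flags="w")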
Example #8
    def read_featureframe(self, spark):
        """
        Reads a training dataset in hdf5 format from HopsFS

        Args:
            :spark: the spark session

        Returns:
            dataframe with the data of the training dataset

        Raises:
              :TrainingDatasetNotFound: if the requested training dataset could not be found
              :CouldNotConvertDataframe: if the hdf5 dataset could not be converted to a spark dataframe
              :HDF5DatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to read an
                                                                          external training dataset in the .hdf5 format.
        """
        if not hasattr(self, 'training_dataset') or \
                        self.training_dataset.training_dataset_type \
                        == constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
            raise HDF5DatasetFormatNotSupportedForExternalTrainingDatasets(
                "The .hdf5 dataset format is not "
                "supported for external training datasets.")
        if not hdfs.exists(
                self.path +
                constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX):
            raise TrainingDatasetNotFound(
                "Could not find a training dataset in file {}".format(
                    self.path +
                    constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX))
        tf = TemporaryFile()
        data = hdfs.load(self.path +
                         constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX)
        tf.write(data)
        tf.seek(0)
        hdf5_file = h5py.File(tf, "r")
        np_array = hdf5_file[self.training_dataset.name][()]
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY:
            return np_array
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON:
            return np_array.tolist()
        if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_SPARK \
                or self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS:
            if np_array.ndim != 2:
                raise CouldNotConvertDataframe(
                    "Cannot convert a numpy array that does not have two dimensions to a dataframe. "
                    "The number of dimensions is: {}".format(np_array.ndim))
            num_cols = np_array.shape[1]
            dataframe_dict = {}
            for n_col in list(range(num_cols)):
                col_name = "col_" + str(n_col)
                dataframe_dict[col_name] = np_array[:, n_col]
            pandas_df = pd.DataFrame(dataframe_dict)
            sc = spark.sparkContext
            sql_context = SQLContext(sc)
            return fs_utils._return_dataframe_type(
                sql_context.createDataFrame(pandas_df), self.dataframe_type)
Example #9
def _build_summary_json(logdir):
    """Builds the summary json to be read by the experiments service.
    """
    combinations = []

    for trial in hopshdfs.ls(logdir):
        if hopshdfs.isdir(trial):
            return_file = trial + "/.outputs.json"
            hparams_file = trial + "/.hparams.json"
            if hopshdfs.exists(return_file) and hopshdfs.exists(hparams_file):
                metric_arr = experiment_utils._convert_return_file_to_arr(
                    return_file)
                hparams_dict = _load_hparams(hparams_file)
                combinations.append({
                    "parameters": hparams_dict,
                    "outputs": metric_arr
                })

    return json.dumps({"combinations": combinations},
                      default=json_default_numpy)
Example #10
def _get_experiments_dir():
    """
    Gets the root folder where the experiments are writing their results

    Returns:
        The folder where the experiments are writing results
    """
    assert hdfs.exists(
        hdfs.project_path() + "Experiments"
    ), "Your project is missing a dataset named Experiments, please create it."
    return hdfs.project_path() + "Experiments"
Example #11
def _upload_file_output(retval, hdfs_exec_logdir):
    if type(retval) is dict:
        for metric_key in retval.keys():
            value = str(retval[metric_key])
            if '/' in value or os.path.exists(os.getcwd() + '/' + value):
                if os.path.exists(value):  # absolute path
                    if hdfs.exists(hdfs_exec_logdir + '/' +
                                   value.split('/')[-1]):
                        hdfs.delete(hdfs_exec_logdir + '/' +
                                    value.split('/')[-1],
                                    recursive=False)
                    pydoop.hdfs.put(value, hdfs_exec_logdir)
                    os.remove(value)
                    hdfs_exec_logdir = hdfs.abs_path(hdfs_exec_logdir)
                    retval[metric_key] = hdfs_exec_logdir[
                        len(hdfs.abs_path(hdfs.project_path())
                            ):] + '/' + value.split('/')[-1]
                elif os.path.exists(os.getcwd() + '/' +
                                    value):  # relative path
                    output_file = os.getcwd() + '/' + value
                    if hdfs.exists(hdfs_exec_logdir + '/' + value):
                        hdfs.delete(hdfs_exec_logdir + '/' + value,
                                    recursive=False)
                    pydoop.hdfs.put(value, hdfs_exec_logdir)
                    os.remove(output_file)
                    hdfs_exec_logdir = hdfs.abs_path(hdfs_exec_logdir)
                    retval[metric_key] = hdfs_exec_logdir[
                        len(hdfs.abs_path(hdfs.project_path())
                            ):] + '/' + output_file.split('/')[-1]
                elif value.startswith('Experiments') and value.endswith(
                        'output.log'):
                    continue
                elif value.startswith('Experiments') and hdfs.exists(
                        hdfs.project_path() + '/' + value):
                    hdfs.cp(hdfs.project_path() + '/' + value,
                            hdfs_exec_logdir)
                else:
                    raise Exception(
                        'Could not find file or directory on path ' +
                        str(value))
Example #12
def _run(sc,
         train_fn,
         run_id,
         local_logdir=False,
         name="no-name",
         evaluator=False):
    """

    Args:
        sc:
        train_fn:
        local_logdir:
        name:

    Returns:

    """
    app_id = str(sc.applicationId)

    num_executions = util.num_executors()

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Make SparkUI intuitive by grouping jobs
    sc.setJobGroup(
        os.environ['ML_ID'],
        "{} | ParameterServerStrategy - Distributed Training".format(name))

    server = parameter_server_reservation.Server(num_executions)

    server_addr = server.start()

    num_ps = util.num_param_servers()

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, train_fn, local_logdir, server_addr,
                      num_ps, evaluator))

    logdir = experiment_utils._get_logdir(app_id, run_id)

    print('Finished Experiment \n')

    path_to_return = logdir + '/.outputs.json'
    if hdfs.exists(path_to_return):
        with hdfs.open_file(path_to_return, flags="r") as fi:
            contents = fi.read()
        return logdir, json.loads(contents)

    return logdir, None
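The "load JSON from HDFS if it exists" step above also closes Example 18. A minimal sketch of it as a helper, assuming the hops hdfs API (exists, load) used in these examples:

import json

def _load_json_if_exists(path):
    # Returns the parsed JSON contents, or None when the file is absent.
    if hdfs.exists(path):
        return json.loads(hdfs.load(path))
    return None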
Example #13
    def initialize_logger(self, exp_dir):
        """Initialize logger of optimizer

        :param exp_dir: path of experiment directory
        :type exp_dir: str
        """

        # configure logger
        self.log_file = exp_dir + "/pruner.log"
        if not hdfs.exists(self.log_file):
            hdfs.dump("", self.log_file)
        self.fd = hdfs.open_file(self.log_file, flags="w")
        self._log("Initialized Pruner Logger")
Example #14
    def __init__(self, log_file, partition_id, task_attempt, print_executor):
        self.metric = None
        self.lock = threading.RLock()
        self.stop = False
        self.trial_id = None
        self.trial_log_file = None
        self.logs = ""
        self.log_file = log_file
        self.partition_id = partition_id
        self.task_attempt = task_attempt
        self.print_executor = print_executor

        # Open executor log file descriptor
        # This log is for all maggy system related log messages
        if not hopshdfs.exists(log_file):
            hopshdfs.dump("", log_file)
        self.fd = hopshdfs.open_file(log_file, flags="w")
        self.trial_fd = None
Example #15
def _build_summary_json(logdir):

    combinations = []
    return_files = []

    for experiment_dir in hdfs.ls(logdir):
        runs = hdfs.ls(experiment_dir, recursive=True)
        for run in runs:
            if run.endswith('.outputs.json'):
                return_files.append(run)

    for return_file in return_files:
        output_arr = _convert_return_file_to_arr(return_file)
        param_file = return_file.replace('outputs.json', 'hparams.json')
        # Reset per run so a missing hparams file does not reuse a stale value
        hp_arr = None
        if hdfs.exists(param_file):
            hp_arr = _convert_param_to_arr(param_file)
        combinations.append({'parameters': hp_arr, 'outputs': output_arr})

    return dumps({'combinations': combinations})
Example #16
def _validate_user_serving_input(
        serving_name, model_path, model_version, artifact_version, transformer,
        model_server, kfserving, batching_enabled, topic_name, num_partitions,
        num_replicas, inference_logging, instances, transformer_instances,
        predictor_resource_config):
    """
    Validate user input on the client side before sending REST call to Hopsworks (additional validation will be done
    in the backend)

    Args:
        :serving_name: the name of the serving to create
        :model_path: path to the model or artifact being served
        :model_version: version of the model to serve
        :artifact_version: version of the artifact to serve
        :transformer: path to the transformer script
        :model_server: name of the model server to deploy, e.g "TENSORFLOW_SERVING" or "FLASK"
        :kfserving: boolean flag whether to serve the model using KFServing serving tool
        :batching_enabled: boolean flag whether to enable batching for inference requests to the serving
        :topic_name: name of the kafka topic for inference logging, e.g "CREATE" to create a new one, "NONE" to not use kafka topic or an existent topic name
        :num_partitions: if a new kafka topic is to created, number of partitions of the new topic
        :num_replicas: if a new kafka topic is to created, replication factor of the new topic
        :inference_logging: inference data to log into the Kafka topic, e.g "MODEL_INPUTS", "PREDICTIONS" or "ALL"
        :instances: the number of serving instances (the more instances the more inference requests can
        be served in parallel)
        :transformer_instances: the number of transformer instances (the more instances the more inference requests can
        be served in parallel)
        :predictor_resource_config: dict for setting resource configuration parameters required to serve the model, for
        example {'memory': 2048, 'cores': 1, 'gpus': 0}. Currently only supported if Hopsworks is deployed with Kubernetes installed.

    Returns:
        None

    Raises:
        :ValueError: if the serving input failed the validation
    """
    name_pattern = re.compile("^[a-zA-Z0-9]+$")
    if len(serving_name) > 256 or serving_name == "" or not name_pattern.match(
            serving_name):
        raise ValueError(
            "Name of serving cannot be empty, cannot exceed 256 characters and must match the regular "
            "expression: ^[a-zA-Z0-9]+$, the provided name: {} is not valid".
            format(serving_name))
    if not hdfs.exists(model_path):
        raise ValueError(
            "The model/artifact path must exist in HDFS, the provided path: {} "
            "does not exist".format(model_path))
    if model_server not in constants.MODEL_SERVING.MODEL_SERVERS:
        raise ValueError(
            "The provided model_server: {} is not supported, supported "
            "model servers are: {}".format(
                model_server, ",".join(constants.MODEL_SERVING.MODEL_SERVERS)))
    if inference_logging is not None and inference_logging not in constants.MODEL_SERVING.INFERENCE_LOGGING_MODES:
        raise ValueError(
            "The provided inference_logging: {} is not supported, supported "
            "inference logging modes are: {}".format(
                inference_logging,
                ",".join(constants.MODEL_SERVING.INFERENCE_LOGGING_MODES)))
    if not isinstance(model_version, int):
        raise ValueError(
            "The model version must be an integer, the provided version is not: {}"
            .format(model_version))
    if model_server == constants.MODEL_SERVING.MODEL_SERVER_TENSORFLOW_SERVING:
        if not isinstance(num_replicas, int):
            raise ValueError(
                "Number of kafka topic replicas must be an integer, the provided num replicas "
                "is not: {}".format(model_version))
        if not isinstance(num_partitions, int):
            raise ValueError(
                "Number of kafka topic partitions must be an integer, the provided num partitions "
                "is not: {}".format(num_partitions))
        if not isinstance(batching_enabled, bool):
            raise ValueError(
                "Batching enabled must be a boolean, the provided value "
                "is not: {}".format(batching_enabled))
        if kfserving and batching_enabled:
            raise ValueError(
                "Batching requests is currently not supported in KFServing deployments"
            )

    if kfserving and model_server == constants.MODEL_SERVING.MODEL_SERVER_FLASK:
        raise ValueError(
            "Flask is currently not supported for KFServing deployments")
    if not isinstance(instances, int):
        raise ValueError("The number of serving instances must be an integer, "
                         "the provided version is not: {}".format(instances))

    if not kfserving:
        if inference_logging is not None and inference_logging != constants.MODEL_SERVING.INFERENCE_LOGGING_ALL:
            raise ValueError(
                "Fine-grained inference logging is only supported in KFServing deployments"
            )
        if topic_name is not None and topic_name != "NONE" and inference_logging != constants.MODEL_SERVING.INFERENCE_LOGGING_ALL:
            raise ValueError(
                "Inference logging mode 'ALL' is the only mode supported for non-KFServing deployments"
            )

    if kfserving:
        if topic_name is not None and topic_name != "NONE" and inference_logging is None:
            raise ValueError(
                "Inference logging must be defined. Supported inference "
                "logging modes are: {}".format(",".join(
                    constants.MODEL_SERVING.INFERENCE_LOGGING_MODES)))

    if predictor_resource_config is not None:
        if type(predictor_resource_config) is not dict:
            raise ValueError("predictor_resource_config must be a dict.")
        if 'memory' not in predictor_resource_config or 'cores' not in predictor_resource_config:
            raise ValueError(
                "predictor_resource_config must contain the keys 'memory' and 'cores'"
            )
Example #17
def export(model_path, model_name, model_version=1, overwrite=False):
    """
    Copies a trained model to the Models directory in the project and creates the directory structure of:

    >>> Models
    >>>      |
    >>>      - model_name
    >>>                 |
    >>>                 - version_x
    >>>                 |
    >>>                 - version_y

    For example if you run this:

    >>> serving.export("iris_knn.pkl", "irisFlowerClassifier", 1, overwrite=True)

    it will copy the local model file "iris_knn.pkl" to /Projects/projectname/Models/irisFlowerClassifier/1/iris_knn.pkl
    on HDFS, and overwrite in case there already exists a file with the same name in the directory.

    If you run:

    >>> serving.export("Resources/iris_knn.pkl", "irisFlowerClassifier", 1, overwrite=True)

    it will first check if the path Resources/iris_knn.pkl exists on your local filesystem in the current working
    directory. If the path was not found, it will check in your project's HDFS directory and if it finds the model there
    it will copy it to /Projects/projectname/Models/irisFlowerClassifier/1/iris_knn.pkl

    If "model" is a directory on the local path exported by tensorflow, and you run:
:
    >>> serving.export("/model/", "mnist", 1, overwrite=True)

    It will copy the model directory contents to /Projects/projectname/Models/mnist/1/ , e.g the "model.pb" file and
    the "variables" directory.

    Args:
        :model_path: path to the trained model (HDFS or local)
        :model_name: name of the model/serving
        :model_version: version of the model/serving
        :overwrite: boolean flag whether to overwrite in case a serving already exists in the exported directory

    Returns:
        The path to where the model was exported

    Raises:
        :ValueError: if there was an error with the export of the model due to invalid user input
    """

    if not hdfs.exists(model_path) and not os.path.exists(model_path):
        raise ValueError("the provided model_path: {} , does not exist in HDFS or on the local filesystem".format(
            model_path))

    # Create directory in HDFS to put the model files
    project_path = hdfs.project_path()
    model_dir_hdfs = project_path + constants.MODEL_SERVING.MODELS_DATASET + \
                     constants.DELIMITERS.SLASH_DELIMITER + str(model_name) + \
                     constants.DELIMITERS.SLASH_DELIMITER + str(model_version) + \
                     constants.DELIMITERS.SLASH_DELIMITER
    if not hdfs.exists(model_dir_hdfs):
        hdfs.mkdir(model_dir_hdfs)

    if (not overwrite) and hdfs.exists(model_dir_hdfs) and hdfs.isfile(model_dir_hdfs):
        raise ValueError("Could not create model directory: {}, the path already exists and is a file, "
                         "set flag overwrite=True "
                         "to remove the file and create the correct directory structure".format(model_dir_hdfs))

    if overwrite and hdfs.exists(model_dir_hdfs) and hdfs.isfile(model_dir_hdfs):
        hdfs.delete(model_dir_hdfs)
        hdfs.mkdir(model_dir_hdfs)


    # Export the model files
    if os.path.exists(model_path):
        return _export_local_model(model_path, model_dir_hdfs, overwrite)
    else:
        return _export_hdfs_model(model_path, model_dir_hdfs, overwrite)
Example #18
def _run(sc,
         map_fun,
         run_id,
         args_dict=None,
         local_logdir=False,
         name="no-name"):
    """

    Args:
        sc:
        map_fun:
        args_dict:
        local_logdir:
        name:

    Returns:

    """

    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        currentLen = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if currentLen != len(arg_lists[i]):
                raise ValueError(
                    'Length of each function argument list must be equal')
            num_executions = len(arg_lists[i])

    sc.setJobGroup(os.environ['ML_ID'],
                   "{} | Launcher running experiment".format(name))
    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    # For single run return .return if exists
    if args_dict is None:
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    elif num_executions == 1:
        arg_count = six.get_function_code(map_fun).co_argcount
        arg_names = six.get_function_code(map_fun).co_varnames
        argIndex = 0
        param_string = ''
        while arg_count > 0:
            param_name = arg_names[argIndex]
            param_val = args_dict[param_name][0]
            param_string += str(param_name) + '=' + str(param_val) + '&'
            arg_count -= 1
            argIndex += 1
        param_string = param_string[:-1]
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/' + param_string + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    else:
        return experiment_utils._get_logdir(app_id, run_id), None
Example #19
def _get_best(args_dict, num_combinations, arg_names, arg_count,
              hdfs_appid_dir, optimization_key):

    if not optimization_key:
        optimization_key = 'metric'

    max_hp = ''
    max_val = ''

    min_hp = ''
    min_val = ''

    min_return_dict = {}
    max_return_dict = {}

    results = []

    first = True

    for i in range(num_combinations):

        argIndex = 0
        param_string = ''

        num_args = arg_count

        while num_args > 0:
            #Get args for executor and run function
            param_name = arg_names[argIndex]
            param_val = args_dict[param_name][i]
            param_string += str(param_name) + '=' + str(param_val) + '&'
            num_args -= 1
            argIndex += 1

        param_string = param_string[:-1]

        path_to_return = hdfs_appid_dir + '/' + param_string + '/.outputs.json'

        assert hdfs.exists(
            path_to_return), 'Could not find .outputs.json file on path: {}'.format(
                path_to_return)

        with hdfs.open_file(path_to_return, flags="r") as fi:
            return_dict = json.loads(fi.read())
            fi.close()

            # handle case when dict with 1 key is returned
            if optimization_key == 'metric' and len(return_dict.keys()) == 1:
                optimization_key = list(return_dict.keys())[0]

            metric = float(return_dict[optimization_key])

            if first:
                max_hp = param_string
                max_val = metric
                max_return_dict = return_dict
                min_hp = param_string
                min_val = metric
                min_return_dict = return_dict
                first = False

            if metric > max_val:
                max_val = metric
                max_hp = param_string
                max_return_dict = return_dict
            if metric < min_val:
                min_val = metric
                min_hp = param_string
                min_return_dict = return_dict

        results.append(metric)

    avg = sum(results) / float(len(results))

    return max_val, max_hp, min_val, min_hp, avg, max_return_dict, min_return_dict
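The while-loop that assembles param_string above can be written more compactly with a join; a sketch equivalent to the original for the same args_dict layout:

param_string = '&'.join(
    '{}={}'.format(name, args_dict[name][i]) for name in arg_names[:arg_count])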
Example #20
def export(model_path, model_name, model_version=None, overwrite=False, metrics=None, description=None, synchronous=True, synchronous_timeout=120):
    """
    Copies a trained model to the Models directory in the project and creates the directory structure of:

    >>> Models
    >>>      |
    >>>      - model_name
    >>>                 |
    >>>                 - version_x
    >>>                 |
    >>>                 - version_y

    For example if you run this:

    >>> from hops import model
    >>> model.export("iris_knn.pkl", "irisFlowerClassifier", metrics={'accuracy': accuracy})

    It will copy the local model file "iris_knn.pkl" to /Projects/projectname/Models/irisFlowerClassifier/1/iris_knn.pkl
    on HDFS, and overwrite in case there already exists a file with the same name in the directory.

    If "model" is a directory on the local path exported by TensorFlow, and you run:

    >>> model.export("/model", "mnist", metrics={'accuracy': accuracy, 'loss': loss})

    It will copy the model directory contents to /Projects/projectname/Models/mnist/1/, e.g. the "model.pb" file and
    the "variables" directory.

    Args:
        :model_path: path to the trained model (HDFS or local)
        :model_name: name of the model
        :model_version: version of the model
        :overwrite: boolean flag whether to overwrite in case a model already exists in the exported directory
        :metrics: dict of evaluation metrics to attach to model
        :description: description about the model
        :synchronous: whether to synchronously wait for the model to be indexed in the models rest endpoint
        :synchronous_timeout: max timeout in seconds for waiting for the model to be indexed

    Returns:
        The path to where the model was exported

    Raises:
        :ValueError: if there was an error with the export of the model due to invalid user input
        :ModelNotFound: if the model was not found
    """

    # Make sure model name is a string, users could supply numbers
    model_name = str(model_name)

    if not isinstance(model_path, string_types):
        model_path = model_path.decode()

    if not description:
        description = 'A collection of models for ' + model_name

    project_path = hdfs.project_path()

    assert hdfs.exists(project_path + "Models"), "Your project is missing a dataset named Models, please create it."

    if not hdfs.exists(model_path) and not os.path.exists(model_path):
        raise ValueError("the provided model_path: {} , does not exist in HDFS or on the local filesystem".format(
            model_path))

    # make sure metrics are numbers
    if metrics:
        _validate_metadata(metrics)

    model_dir_hdfs = project_path + constants.MODEL_SERVING.MODELS_DATASET + \
                     constants.DELIMITERS.SLASH_DELIMITER + model_name + constants.DELIMITERS.SLASH_DELIMITER

    if not hdfs.exists(model_dir_hdfs):
        hdfs.mkdir(model_dir_hdfs)
        hdfs.chmod(model_dir_hdfs, "ug+rwx")

    # User did not specify model_version, pick the current highest version + 1, set to 1 if no model exists
    version_list = []
    if not model_version and hdfs.exists(model_dir_hdfs):
        model_version_directories = hdfs.ls(model_dir_hdfs)
        for version_dir in model_version_directories:
            try:
                if hdfs.isdir(version_dir):
                    version_list.append(int(version_dir[len(model_dir_hdfs):]))
            except:
                pass
        if len(version_list) > 0:
            model_version = max(version_list) + 1

    if not model_version:
        model_version = 1

    # Path to directory in HDFS to put the model files
    model_version_dir_hdfs = model_dir_hdfs + str(model_version)

    # If version directory already exists and we are not overwriting it then fail
    if not overwrite and hdfs.exists(model_version_dir_hdfs):
        raise ValueError("Could not create model directory: {}, the path already exists, "
                         "set flag overwrite=True "
                         "to remove the version directory and create the correct directory structure".format(model_version_dir_hdfs))

    # Overwrite version directory by deleting all content (this is needed for Provenance to register Model as deleted)
    if overwrite and hdfs.exists(model_version_dir_hdfs):
        hdfs.delete(model_version_dir_hdfs, recursive=True)
        hdfs.mkdir(model_version_dir_hdfs)

    # At this point we can create the version directory if it does not exist
    if not hdfs.exists(model_version_dir_hdfs):
        hdfs.mkdir(model_version_dir_hdfs)

    # Export the model files
    if os.path.exists(model_path):
        export_dir = _export_local_model(model_path, model_version_dir_hdfs, overwrite)
    else:
        export_dir = _export_hdfs_model(model_path, model_version_dir_hdfs, overwrite)

    print("Exported model " + model_name + " as version " + str(model_version) + " successfully.")

    jobName = None
    if constants.ENV_VARIABLES.JOB_NAME_ENV_VAR in os.environ:
        jobName = os.environ[constants.ENV_VARIABLES.JOB_NAME_ENV_VAR]

    kernelId = None
    if constants.ENV_VARIABLES.KERNEL_ID_ENV_VAR in os.environ:
        kernelId = os.environ[constants.ENV_VARIABLES.KERNEL_ID_ENV_VAR]

    # Attach modelName_modelVersion to experiment directory
    model_summary = {'name': model_name, 'version': model_version, 'metrics': metrics,
                     'experimentId': None, 'description': description, 'jobName': jobName,
                     'kernelId': kernelId}
    if 'ML_ID' in os.environ:
        # Attach link from experiment to model
        experiment_utils._attach_model_link_xattr(os.environ['ML_ID'], model_name + '_' + str(model_version))
        # Attach model metadata to models version folder
        model_summary['experimentId'] = os.environ['ML_ID']
        experiment_utils._attach_model_xattr(model_name + "_" + str(model_version), experiment_utils.dumps(model_summary))
    else:
        experiment_utils._attach_model_xattr(model_name + "_" + str(model_version), experiment_utils.dumps(model_summary))

    # Model metadata is attached asynchronously by Epipe, so this wait is necessary to ensure subsequent steps in a pipeline will not fail
    if synchronous:
        start_time = time.time()
        sleep_seconds = 5
        for i in range(int(synchronous_timeout/sleep_seconds)):
            try:
                time.sleep(sleep_seconds)
                print("Polling " + model_name + " version " + str(model_version) + " for model availability.")
                resp = get_model(model_name, model_version)
                if resp.ok:
                    print("Model now available.")
                    return export_dir
                print(model_name + " not ready yet, retrying in " + str(sleep_seconds) + " seconds.")
            except ModelNotFound:
                pass
        print("Model not available during polling, set a higher value for synchronous_timeout to wait longer.")

    return export_dir
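The polling loop above is a general retry-until-timeout pattern. A minimal sketch of it in isolation; this helper is illustrative and not part of the hops API:

import time

def wait_until(check, timeout=120, interval=5):
    # Re-run `check` until it returns truthy or the timeout elapses.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if check():
            return True
        time.sleep(interval)
    return False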
Example #21
    def _wrapper_fun(iter):
        """
        Wraps the user supplied training function in order to be passed to the
        Spark Executors.

        Args:
            iter: the partition iterator (unused)

        Returns:
            None
        """
        experiment_utils._set_ml_id(app_id, run_id)

        # get task context information to determine executor identifier
        partition_id, task_attempt = util.get_partition_attempt_id()

        client = rpc.Client(server_addr, partition_id, task_attempt,
                            hb_interval, secret)
        log_file = (log_dir + "/executor_" + str(partition_id) + "_" +
                    str(task_attempt) + ".log")

        # save the builtin print
        original_print = __builtin__.print

        reporter = Reporter(log_file, partition_id, task_attempt,
                            original_print)

        def maggy_print(*args, **kwargs):
            """Maggy custom print() function."""
            original_print(*args, **kwargs)
            reporter.log(" ".join(str(x) for x in args), True)

        # override the builtin print
        __builtin__.print = maggy_print

        try:
            client_addr = client.client_addr

            host_port = client_addr[0] + ":" + str(client_addr[1])

            exec_spec = {}
            exec_spec["partition_id"] = partition_id
            exec_spec["task_attempt"] = task_attempt
            exec_spec["host_port"] = host_port
            exec_spec["trial_id"] = None

            reporter.log("Registering with experiment driver", False)
            client.register(exec_spec)

            client.start_heartbeat(reporter)

            # blocking
            trial_id, parameters = client.get_suggestion(reporter)

            while not client.done:
                if experiment_type == "ablation":
                    ablation_params = {
                        "ablated_feature":
                        parameters.get("ablated_feature", "None"),
                        "ablated_layer":
                        parameters.get("ablated_layer", "None"),
                    }
                    parameters.pop("ablated_feature")
                    parameters.pop("ablated_layer")

                tb_logdir = log_dir + "/" + trial_id
                trial_log_file = tb_logdir + "/output.log"
                reporter.set_trial_id(trial_id)

                # If trial is repeated, delete trial directory, except log file
                if hopshdfs.exists(tb_logdir):
                    util._clean_dir(tb_logdir, [trial_log_file])
                else:
                    hopshdfs.mkdir(tb_logdir)

                reporter.init_logger(trial_log_file)
                tensorboard._register(tb_logdir)
                if experiment_type == "ablation":
                    hopshdfs.dump(
                        json.dumps(ablation_params,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                else:
                    hopshdfs.dump(
                        json.dumps(parameters,
                                   default=util.json_default_numpy),
                        tb_logdir + "/.hparams.json",
                    )

                try:
                    reporter.log("Starting Trial: {}".format(trial_id), False)
                    reporter.log("Trial Configuration: {}".format(parameters),
                                 False)

                    if experiment_type == "optimization":
                        tensorboard._write_hparams(parameters, trial_id)

                    sig = inspect.signature(map_fun)
                    if sig.parameters.get("reporter", None):
                        retval = map_fun(**parameters, reporter=reporter)
                    else:
                        retval = map_fun(**parameters)

                    if experiment_type == "optimization":
                        tensorboard._write_session_end()

                    retval = util._handle_return_val(retval, tb_logdir,
                                                     optimization_key,
                                                     trial_log_file)

                except exceptions.EarlyStopException as e:
                    retval = e.metric
                    reporter.log("Early Stopped Trial.", False)

                reporter.log("Finished Trial: {}".format(trial_id), False)
                reporter.log("Final Metric: {}".format(retval), False)
                client.finalize_metric(retval, reporter)

                # blocking
                trial_id, parameters = client.get_suggestion(reporter)

        except:  # noqa: E722
            reporter.log(traceback.format_exc(), False)
            raise
        finally:
            reporter.close_logger()
            client.stop()
            client.close()
Example #22
    def __init__(self, experiment_type, **kwargs):

        global driver_secret

        # COMMON EXPERIMENT SETUP
        self._final_store = []
        self._trial_store = {}
        self.num_executors = kwargs.get("num_executors")
        self._message_q = queue.Queue()
        self.name = kwargs.get("name")
        self.experiment_done = False
        self.worker_done = False
        self.hb_interval = kwargs.get("hb_interval")
        self.description = kwargs.get("description")
        self.experiment_type = experiment_type
        self.es_interval = kwargs.get("es_interval")
        self.es_min = kwargs.get("es_min")

        # TYPE-SPECIFIC EXPERIMENT SETUP
        if self.experiment_type == "optimization":
            # set up an optimization experiment

            self.num_trials = kwargs.get("num_trials", 1)

            searchspace = kwargs.get("searchspace")
            if isinstance(searchspace, Searchspace):
                self.searchspace = searchspace
            elif searchspace is None:
                self.searchspace = Searchspace()
            else:
                raise Exception(
                    "The experiment's search space should be an instance of maggy.Searchspace, "
                    "but it is {0} (of type '{1}').".format(
                        str(searchspace),
                        type(searchspace).__name__))

            optimizer = kwargs.get("optimizer")

            if optimizer is None:
                if len(self.searchspace.names()) == 0:
                    self.optimizer = SingleRun()
                else:
                    raise Exception(
                        "Searchspace has to be empty or None to use without optimizer"
                    )
            elif isinstance(optimizer, str):
                if optimizer.lower() == "randomsearch":
                    self.optimizer = RandomSearch()
                elif optimizer.lower() == "asha":
                    self.optimizer = Asha()
                elif optimizer.lower() == "none":
                    if len(self.searchspace.names()) == 0:
                        self.optimizer = SingleRun()
                    else:
                        raise Exception(
                            "Searchspace has to be empty or None to use without Optimizer."
                        )
                else:
                    raise Exception(
                        "Unknown Optimizer. Can't initialize experiment driver."
                    )
            elif isinstance(optimizer, AbstractOptimizer):
                self.optimizer = optimizer
                print("Custom Optimizer initialized.")
            else:
                raise Exception(
                    "The experiment's optimizer should either be an string indicating the name "
                    "of an implemented optimizer (such as 'randomsearch') or an instance of "
                    "maggy.optimizer.AbstractOptimizer, "
                    "but it is {0} (of type '{1}').".format(
                        str(optimizer),
                        type(optimizer).__name__))

            direction = kwargs.get("direction", "max")
            if isinstance(direction,
                          str) and direction.lower() in ["min", "max"]:
                self.direction = direction.lower()
            else:
                raise Exception(
                    "The experiment's direction should be an string (either 'min' or 'max') "
                    "but it is {0} (of type '{1}').".format(
                        str(direction),
                        type(direction).__name__))

            es_policy = kwargs.get("es_policy")
            if isinstance(es_policy, str):
                if es_policy.lower() == "median":
                    self.earlystop_check = MedianStoppingRule.earlystop_check
                elif es_policy.lower() == "none":
                    self.earlystop_check = NoStoppingRule.earlystop_check
                else:
                    raise Exception(
                        "The experiment's early stopping policy should either be a string ('median' or 'none') "
                        "or a custom policy that is an instance of maggy.earlystop.AbstractEarlyStop, "
                        "but it is {0} (of type '{1}').".format(
                            str(es_policy),
                            type(es_policy).__name__))
            elif isinstance(es_policy, AbstractEarlyStop):
                self.earlystop_check = es_policy.earlystop_check
                print("Custom Early Stopping policy initialized.")
            else:
                raise Exception(
                    "The experiment's early stopping policy should either be a string ('median' or 'none') "
                    "or a custom policy that is an instance of maggy.earlystop.AbstractEarlyStop, "
                    "but it is {0} (of type '{1}').".format(
                        str(es_policy),
                        type(es_policy).__name__))

            self.es_interval = kwargs.get("es_interval")
            self.es_min = kwargs.get("es_min")

            self.result = {
                "best_val": "n.a.",
                "num_trials": 0,
                "early_stopped": 0
            }

        elif self.experiment_type == "ablation":
            # set up an ablation study experiment
            self.earlystop_check = NoStoppingRule.earlystop_check

            ablation_study = kwargs.get("ablation_study")
            if isinstance(ablation_study, AblationStudy):
                self.ablation_study = ablation_study
            else:
                raise Exception(
                    "The experiment's ablation study configuration should be an instance of "
                    "maggy.ablation.AblationStudy, "
                    "but it is {0} (of type '{1}').".format(
                        str(ablation_study),
                        type(ablation_study).__name__))

            searchspace = kwargs.get("searchspace")
            if not searchspace:
                self.searchspace = Searchspace()
            else:
                raise Exception(
                    "The experiment's search space should be None for ablation experiments, "
                    "but it is {0} (of type '{1}').".format(
                        str(searchspace),
                        type(searchspace).__name__))

            ablator = kwargs.get("ablator")
            if isinstance(ablator, str):
                if ablator.lower() == "loco":
                    self.ablator = LOCO(ablation_study, self._final_store)
                    self.num_trials = self.ablator.get_number_of_trials()
                    if self.num_executors > self.num_trials:
                        self.num_executors = self.num_trials
                else:
                    raise Exception(
                        "The experiment's ablation study policy should either be a string ('loco') "
                        "or a custom policy that is an instance of maggy.ablation.ablation.AbstractAblator, "
                        "but it is {0} (of type '{1}').".format(
                            str(ablator),
                            type(ablator).__name__))
            elif isinstance(ablator, AbstractAblator):
                self.ablator = ablator
                print("Custom Ablator initialized. \n")
            else:
                raise Exception(
                    "The experiment's ablation study policy should either be a string ('loco') "
                    "or a custom policy that is an instance of maggy.ablation.ablation.AbstractAblator, "
                    "but it is {0} (of type '{1}').".format(
                        str(ablator),
                        type(ablator).__name__))

            self.result = {
                "best_val": "n.a.",
                "num_trials": 0,
                "early_stopped": "n.a"
            }
        else:
            raise Exception(
                "Unknown experiment type. experiment_type should be either 'optimization' or 'ablation', "
                "but it is {0}.".format(str(self.experiment_type)))

        # FINALIZE EXPERIMENT SETUP
        self.server = rpc.Server(self.num_executors)
        if not driver_secret:
            driver_secret = self._generate_secret(
                ExperimentDriver.SECRET_BYTES)
        self._secret = driver_secret
        self.job_start = datetime.now()
        self.executor_logs = ""
        self.maggy_log = ""
        self.log_lock = threading.RLock()
        self.log_file = kwargs.get("log_dir") + "/maggy.log"
        self.log_dir = kwargs.get("log_dir")
        self.exception = None

        # Open File desc for HDFS to log
        if not hopshdfs.exists(self.log_file):
            hopshdfs.dump("", self.log_file)
        self.fd = hopshdfs.open_file(self.log_file, flags="w")
Example #23
    def exists(self, hdfs_path, project=None):
        return hopshdfs.exists(hdfs_path, project=project)
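A hypothetical use of this wrapper; `fs` stands for an instance of the enclosing class, which is not shown in this example:

if fs.exists("Resources/iris_knn.pkl"):
    print("model file is present in the project")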