def read_featureframe(self, spark):
    """
    Reads a training dataset in petastorm format from HopsFS

    The data is read with the parquet reader. When this object has a
    `training_dataset` whose type is not the external type, the data is looked
    up first at `self.path` and then at `self.path` plus the petastorm suffix.
    Otherwise `self.path` is read directly, without an existence check.

    Args:
        :spark: the spark session

    Returns:
        dataframe with the data of the training dataset

    Raises:
          :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    suffixed_path = self.path + constants.FEATURE_STORE.TRAINING_DATASET_PETASTORM_SUFFIX
    must_locate = hasattr(self, 'training_dataset') and \
        self.training_dataset.training_dataset_type != \
        constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE

    # Probe each candidate location at most once; falling through both probes
    # means the dataset is missing.
    if not must_locate or hdfs.exists(self.path):
        read_path = self.path
    elif hdfs.exists(suffixed_path):
        read_path = suffixed_path
    else:
        raise TrainingDatasetNotFound(
            "Could not find a training dataset in folder {} "
            "or in file {}".format(self.path, suffixed_path))

    spark_df = spark.read.parquet(read_path)
    return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
def read_featureframe(self, spark):
    """
    Reads a training dataset in CSV format from HopsFS

    The CSV reader is configured with a header row and a comma delimiter. When
    this object has a `training_dataset` whose type is not the external type,
    the data is looked up first at `self.path` and then at `self.path` plus the
    CSV suffix. Otherwise `self.path` is read directly, without an existence
    check.

    Args:
        :spark: the spark session

    Returns:
        dataframe with the data of the training dataset

    Raises:
          :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    suffixed_path = self.path + constants.FEATURE_STORE.TRAINING_DATASET_CSV_SUFFIX
    # The hasattr guard mirrors the other format readers so that an object
    # without a training_dataset falls back to reading self.path directly.
    must_locate = hasattr(self, 'training_dataset') and \
        self.training_dataset.training_dataset_type != \
        constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE

    # Probe each candidate location at most once; falling through both probes
    # means the dataset is missing.
    if not must_locate or hdfs.exists(self.path):
        read_path = self.path
    elif hdfs.exists(suffixed_path):
        read_path = suffixed_path
    else:
        raise TrainingDatasetNotFound(
            "Could not find a training dataset in folder {} or in file {}".format(
                self.path, suffixed_path))

    reader = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_CSV_FORMAT) \
        .option(constants.SPARK_CONFIG.SPARK_WRITE_HEADER, "true") \
        .option(constants.SPARK_CONFIG.SPARK_WRITE_DELIMITER,
                constants.DELIMITERS.COMMA_DELIMITER)
    spark_df = reader.load(read_path)
    return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
def read_featureframe(self, spark):
    """
    Reads a training dataset in tfrecords format from HopsFS

    The tfrecords reader is configured with the "example" record type. When
    this object has a `training_dataset` whose type is not the external type,
    the data is looked up first at `self.path` and then at `self.path` plus the
    tfrecords suffix. Otherwise `self.path` is read directly, without an
    existence check.

    Args:
        :spark: the spark session

    Returns:
        dataframe with the data of the training dataset

    Raises:
          :TrainingDatasetNotFound: if the requested training dataset could not be found
    """
    suffixed_path = self.path + constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_SUFFIX
    must_locate = hasattr(self, 'training_dataset') and \
        self.training_dataset.training_dataset_type != \
        constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE

    # Probe each candidate location at most once; falling through both probes
    # means the dataset is missing.
    if not must_locate or hdfs.exists(self.path):
        read_path = self.path
    elif hdfs.exists(suffixed_path):
        read_path = suffixed_path
    else:
        raise TrainingDatasetNotFound(
            "Could not find a training dataset in folder {} or in file {}".format(
                self.path, suffixed_path))

    reader = spark.read.format(constants.FEATURE_STORE.TRAINING_DATASET_TFRECORDS_FORMAT) \
        .option(constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE,
                constants.SPARK_CONFIG.SPARK_TF_CONNECTOR_RECORD_TYPE_EXAMPLE)
    spark_df = reader.load(read_path)
    return fs_utils._return_dataframe_type(spark_df, self.dataframe_type)
def _version_resources(versioned_resources, rundir): """ Args: versioned_resources: rundir: Returns: """ if not versioned_resources: return None pyhdfs_handle = hdfs.get() pyhdfs_handle.create_directory(rundir) endpoint_prefix = hdfs.project_path() versioned_paths = [] for hdfs_resource in versioned_resources: if pydoop.hdfs.path.exists(hdfs_resource): log("Versoning resource '%s' in rundir '%s'" % (hdfs_resource, rundir)) # Remove the file if it exists target_path = os.path.join(rundir, os.path.basename(hdfs_resource)) if hdfs.exists(target_path): hdfs.rmr(target_path) hdfs.cp(hdfs_resource, rundir) path, filename = os.path.split(hdfs_resource) versioned_paths.append( rundir.replace(endpoint_prefix, '') + '/' + filename) else: log("Resource not found '%s'" % hdfs_resource, level='warning') #raise Exception('Could not find resource in specified path: ' + hdfs_resource) return ', '.join(versioned_paths)
def _validate_user_serving_input(model_path, model_name, serving_type, model_version, batching_enabled, num_partitions, num_replicas, instances): """ Validate user input on the client side before sending REST call to Hopsworks (additional validation will be done in the backend) Args: :model_path: path to the model or artifact being served :model_name: the name of the serving to create :serving_type: the type of serving :model_version: version of the serving :batching_enabled: boolean flag whether to enable batching for inference requests to the serving :num_partitions: kafka partitions :num_replicas: kafka replicas :instances: the number of serving instances (the more instances the more inference requests can be served in parallel) Returns: None Raises: :ValueError: if the serving input failed the validation """ name_pattern = re.compile("^[a-zA-Z0-9]+$") if len(model_name) > 256 or model_name == "" or not name_pattern.match( model_name): raise ValueError( "Name of serving cannot be empty, cannot exceed 256 characters and must match the regular " "expression: ^[a-zA-Z0-9]+$, the provided name: {} is not valid". 
format(model_name)) if not hdfs.exists(model_path): raise ValueError( "The model/artifact path must exist in HDFS, the provided path: {} " "does not exist".format(model_path)) if serving_type not in constants.MODEL_SERVING.SERVING_TYPES: raise ValueError( "The provided serving_type: {} is not supported, supported " "serving types are: {}".format( serving_type, ",".join(constants.MODEL_SERVING.SERVING_TYPES))) if not isinstance(model_version, int): raise ValueError( "The model version must be an integer, the provided version is not: {}" .format(model_version)) if serving_type == constants.MODEL_SERVING.SERVING_TYPE_TENSORFLOW: if not isinstance(num_replicas, int): raise ValueError( "Number of kafka topic replicas must be an integer, the provided num replicas " "is not: {}".format(model_version)) if not isinstance(num_partitions, int): raise ValueError( "Number of kafka topic partitions must be an integer, the provided num partitions " "is not: {}".format(num_partitions)) if not isinstance(batching_enabled, bool): raise ValueError( "Batching enabled must be a boolean, the provided value " "is not: {}".format(batching_enabled)) if not isinstance(instances, int): raise ValueError("The number of serving instances must be an integer, " "the provided version is not: {}".format(instances))
def _create_experiment_dir(app_id, run_id):
    """
    Creates a fresh experiment directory for the given application and run.

    The directory path comes from `_get_logdir`. If it already exists it is
    deleted recursively before being created again.

    Args:
        app_id: application id passed on to `_get_logdir`
        run_id: run id passed on to `_get_logdir`

    Returns:
        None
    """
    target_dir = _get_logdir(app_id, run_id)

    stale_dir_present = hdfs.exists(target_dir)
    if stale_dir_present:
        hdfs.delete(target_dir, recursive=True)

    hdfs.mkdir(target_dir)
def init_logger(self, trial_log_file):
    """Initializes the trial log file.

    Remembers the path on the instance, writes an empty file at that path if
    none exists yet, and opens it for writing as `self.trial_fd`.

    :param trial_log_file: path of the trial log file
    """
    self.trial_log_file = trial_log_file
    log_path = self.trial_log_file

    # Make sure there is a file to open before taking the descriptor
    log_missing = not hopshdfs.exists(log_path)
    if log_missing:
        hopshdfs.dump("", log_path)

    self.trial_fd = hopshdfs.open_file(log_path, flags="w")
def read_featureframe(self, spark):
    """
    Reads a training dataset in hdf5 format from HopsFS

    The hdf5 file is loaded from `self.path` plus the hdf5 suffix into a local
    temporary file, and the dataset named after the training dataset is read
    into a numpy array. The array is then returned in the form selected by
    `self.dataframe_type` (numpy array, python list, or a dataframe built from
    columns named "col_0", "col_1", ...).

    Args:
        :spark: the spark session

    Returns:
        dataframe with the data of the training dataset; None if
        `self.dataframe_type` is none of the numpy, python, spark or pandas types

    Raises:
          :TrainingDatasetNotFound: if the requested training dataset could not be found
          :CouldNotConvertDataframe: if the hdf5 dataset could not be converted to a spark dataframe
          :HDF5DatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to read an
                                                                      external training dataset in the .hdf5 format.
    """
    if not hasattr(self, 'training_dataset') or \
            self.training_dataset.training_dataset_type \
            == constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
        raise HDF5DatasetFormatNotSupportedForExternalTrainingDatasets(
            "The .hdf5 dataset format is not "
            "supported for external training datasets.")

    hdf5_path = self.path + constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX
    if not hdfs.exists(hdf5_path):
        raise TrainingDatasetNotFound(
            "Could not find a training dataset in file {}".format(hdf5_path))

    # Both the temporary file and the hdf5 handle are closed as soon as the
    # dataset has been read; `[()]` reads the dataset into memory, so the array
    # does not depend on the open file afterwards.
    with TemporaryFile() as tf:
        tf.write(hdfs.load(hdf5_path))
        tf.seek(0)
        with h5py.File(tf) as hdf5_file:
            np_array = hdf5_file[self.training_dataset.name][()]

    if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY:
        return np_array
    if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON:
        return np_array.tolist()
    if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_SPARK \
            or self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS:
        if np_array.ndim != 2:
            raise CouldNotConvertDataframe(
                "Cannot convert numpy array that do not have two dimensions to a dataframe. "
                "The number of dimensions are: {}".format(np_array.ndim))
        num_cols = np_array.shape[1]
        dataframe_dict = {}
        for n_col in range(num_cols):
            dataframe_dict["col_" + str(n_col)] = np_array[:, n_col]
        pandas_df = pd.DataFrame(dataframe_dict)
        sc = spark.sparkContext
        sql_context = SQLContext(sc)
        return fs_utils._return_dataframe_type(
            sql_context.createDataFrame(pandas_df), self.dataframe_type)
def _build_summary_json(logdir):
    """Builds the summary json to be read by the experiments service.

    Every directory directly under `logdir` that contains both a
    `.outputs.json` and a `.hparams.json` file contributes one entry with the
    keys "parameters" and "outputs". Directories missing either file are
    skipped.

    :param logdir: directory whose entries are scanned for trial directories
    :return: json string of the form {"combinations": [...]}
    """
    summary_entries = []

    for entry in hopshdfs.ls(logdir):
        if not hopshdfs.isdir(entry):
            continue

        outputs_path = entry + "/.outputs.json"
        hparams_path = entry + "/.hparams.json"
        if not (hopshdfs.exists(outputs_path) and hopshdfs.exists(hparams_path)):
            continue

        outputs = experiment_utils._convert_return_file_to_arr(outputs_path)
        hparams = _load_hparams(hparams_path)
        summary_entries.append({"parameters": hparams, "outputs": outputs})

    summary = {"combinations": summary_entries}
    return json.dumps(summary, default=json_default_numpy)
def _get_experiments_dir():
    """
    Gets the root folder where the experiments are writing their results

    Returns:
        The folder where the experiments are writing results

    Raises:
        :AssertionError: if the project has no dataset named Experiments
    """
    experiments_dir = hdfs.project_path() + "Experiments"
    # Raised explicitly instead of via an `assert` statement so the check is
    # still performed when Python runs with optimizations (-O) enabled.
    if not hdfs.exists(experiments_dir):
        raise AssertionError(
            "Your project is missing a dataset named Experiments, please create it.")
    return experiments_dir
def _upload_file_output(retval, hdfs_exec_logdir):
    """
    Uploads local files referenced by the values of `retval` to HDFS.

    Only acts when `retval` is a dict. Each value is converted to a string and
    treated as a possible path if it contains a '/' or names something that
    exists under the current working directory. Local files are put into
    `hdfs_exec_logdir`, removed from the local filesystem, and the dict entry
    is rewritten in place to the uploaded location with the absolute project
    path prefix cut off.

    Args:
        retval: the value to inspect; anything that is not a dict is ignored
        hdfs_exec_logdir: HDFS directory the files are uploaded/copied into

    Returns:
        None (`retval` is modified in place)

    Raises:
        :Exception: if a path-like value matches none of the handled cases
    """
    if type(retval) is dict:
        for metric_key in retval.keys():
            value = str(retval[metric_key])
            # Values without a '/' that do not name a local file are left untouched
            if '/' in value or os.path.exists(os.getcwd() + '/' + value):
                # NOTE(review): os.path.exists(value) is also true for paths that
                # resolve relative to the current working directory, so this branch
                # may handle more than absolute paths -- confirm.
                if os.path.exists(value):  # path exists as given (absolute path)
                    # Delete a same-named file already in the log dir before uploading
                    if hdfs.exists(hdfs_exec_logdir + '/' + value.split('/')[-1]):
                        hdfs.delete(hdfs_exec_logdir + '/' + value.split('/')[-1], recursive=False)
                    pydoop.hdfs.put(value, hdfs_exec_logdir)
                    os.remove(value)
                    # From here on (including later iterations) the log dir is the absolute path
                    hdfs_exec_logdir = hdfs.abs_path(hdfs_exec_logdir)
                    # Store the uploaded location with the absolute project path prefix removed
                    retval[metric_key] = hdfs_exec_logdir[len(hdfs.abs_path(hdfs.project_path())):] + '/' + value.split('/')[-1]
                elif os.path.exists(os.getcwd() + '/' + value):  # path relative to the working directory
                    output_file = os.getcwd() + '/' + value
                    # NOTE(review): the existence check uses the full relative value while
                    # the stored path below uses only the last path component -- verify
                    # these agree for values containing sub-directories.
                    if hdfs.exists(hdfs_exec_logdir + '/' + value):
                        hdfs.delete(hdfs_exec_logdir + '/' + value, recursive=False)
                    pydoop.hdfs.put(value, hdfs_exec_logdir)
                    os.remove(output_file)
                    hdfs_exec_logdir = hdfs.abs_path(hdfs_exec_logdir)
                    retval[metric_key] = hdfs_exec_logdir[len(hdfs.abs_path(hdfs.project_path())):] + '/' + output_file.split('/')[-1]
                elif value.startswith('Experiments') and value.endswith('output.log'):
                    # Values pointing at an output.log under Experiments are skipped as-is
                    continue
                elif value.startswith('Experiments') and hdfs.exists(hdfs.project_path() + '/' + value):
                    # Value names an existing path under the project: copy it into the log dir
                    hdfs.cp(hdfs.project_path() + '/' + value, hdfs_exec_logdir)
                else:
                    raise Exception('Could not find file or directory on path ' + str(value))
def _run(sc, train_fn, run_id, local_logdir=False, name="no-name", evaluator=False):
    """
    Runs the training function on the executors with a parameter server
    reservation server, then collects the outputs of the run.

    One Spark partition is created per executor (as reported by
    `util.num_executors()`), and the function built by `_prepare_func` is
    executed on every partition.

    Args:
        sc: the spark context
        train_fn: the training function, passed on to `_prepare_func`
        run_id: id of the run, used to resolve the log directory
        local_logdir: passed on to `_prepare_func`
        name: name shown in the Spark job group description
        evaluator: passed on to `_prepare_func`

    Returns:
        A tuple (logdir, outputs): outputs is the parsed content of
        `<logdir>/.outputs.json`, or None if that file does not exist
    """
    app_id = str(sc.applicationId)

    num_executions = util.num_executors()

    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Make SparkUI intuitive by grouping jobs
    sc.setJobGroup(
        os.environ['ML_ID'],
        "{} | ParameterServerStrategy - Distributed Training".format(name))

    server = parameter_server_reservation.Server(num_executions)
    server_addr = server.start()

    num_ps = util.num_param_servers()

    # Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, train_fn, local_logdir, server_addr, num_ps, evaluator))

    logdir = experiment_utils._get_logdir(app_id, run_id)

    print('Finished Experiment \n')

    path_to_return = logdir + '/.outputs.json'
    if hdfs.exists(path_to_return):
        # The context manager closes the file; no explicit close() is needed
        with hdfs.open_file(path_to_return, flags="r") as fi:
            contents = fi.read()
        return logdir, json.loads(contents)

    return logdir, None
def initialize_logger(self, exp_dir):
    """Initialize logger of optimizer

    Sets `self.log_file` to `<exp_dir>/pruner.log`, writes an empty file there
    if none exists yet, opens it for writing as `self.fd`, and logs an
    initialization message through `self._log`.

    :param exp_dir: path of experiment directory
    :type exp_dir: str
    """
    pruner_log_path = exp_dir + "/pruner.log"
    self.log_file = pruner_log_path

    # Create the file first when it is missing, then take the descriptor
    log_missing = not hdfs.exists(self.log_file)
    if log_missing:
        hdfs.dump("", self.log_file)

    self.fd = hdfs.open_file(self.log_file, flags="w")
    self._log("Initialized Pruner Logger")
def __init__(self, log_file, partition_id, task_attempt, print_executor):
    """Sets up the per-executor state and opens the executor log file.

    :param log_file: path of the executor log file; an empty file is written
        there first if none exists
    :param partition_id: stored as `self.partition_id`
    :param task_attempt: stored as `self.task_attempt`
    :param print_executor: stored as `self.print_executor`
    """
    # Values handed in by the caller
    self.log_file = log_file
    self.partition_id = partition_id
    self.task_attempt = task_attempt
    self.print_executor = print_executor

    # State that starts out empty
    self.metric = None
    self.trial_id = None
    self.trial_log_file = None
    self.logs = ""
    self.stop = False
    self.lock = threading.RLock()

    # Open executor log file descriptor
    # This log is for all maggy system related log messages
    executor_log_missing = not hopshdfs.exists(log_file)
    if executor_log_missing:
        hopshdfs.dump("", log_file)
    self.fd = hopshdfs.open_file(log_file, flags="w")

    # No trial log is open until one is initialized
    self.trial_fd = None
def _build_summary_json(logdir):
    """
    Builds the summary json for all runs found under the log directory.

    Every file ending in '.outputs.json' found recursively under the entries
    of `logdir` contributes one combination. Its parameters are read from the
    sibling '.hparams.json' file, or are None when that file does not exist.

    Args:
        logdir: directory whose entries are searched for outputs files

    Returns:
        The serialized summary of the form {'combinations': [...]}, where every
        entry has the keys 'parameters' and 'outputs'
    """
    outputs_suffix = 'outputs.json'

    return_files = []
    for experiment_dir in hdfs.ls(logdir):
        return_files.extend(
            run for run in hdfs.ls(experiment_dir, recursive=True)
            if run.endswith('.' + outputs_suffix))

    combinations = []
    for return_file in return_files:
        output_arr = _convert_return_file_to_arr(return_file)

        # Swap only the trailing file name, so a directory whose name happens
        # to contain 'outputs.json' is left untouched.
        param_file = return_file[:-len(outputs_suffix)] + 'hparams.json'

        # Start from None for every run so that a run without an hparams file
        # does not inherit the parameters of the previous run.
        hp_arr = None
        if hdfs.exists(param_file):
            hp_arr = _convert_param_to_arr(param_file)

        combinations.append({'parameters': hp_arr, 'outputs': output_arr})

    return dumps({'combinations': combinations})
def _validate_user_serving_input( serving_name, model_path, model_version, artifact_version, transformer, model_server, kfserving, batching_enabled, topic_name, num_partitions, num_replicas, inference_logging, instances, transformer_instances, predictor_resource_config): """ Validate user input on the client side before sending REST call to Hopsworks (additional validation will be done in the backend) Args: :serving_name: the name of the serving to create :model_path: path to the model or artifact being served :model_version: version of the model to serve :artifact_version: version of the artifact to serve :transformer: path to the transformer script :model_server: name of the model server to deploy, e.g "TENSORFLOW_SERVING" or "FLASK" :kfserving: boolean flag whether to serve the model using KFServing serving tool :batching_enabled: boolean flag whether to enable batching for inference requests to the serving :topic_name: name of the kafka topic for inference logging, e.g "CREATE" to create a new one, "NONE" to not use kafka topic or an existent topic name :num_partitions: if a new kafka topic is to created, number of partitions of the new topic :num_replicas: if a new kafka topic is to created, replication factor of the new topic :inference_logging: inference data to log into the Kafka topic, e.g "MODEL_INPUTS", "PREDICTIONS" or "ALL" :instances: the number of serving instances (the more instances the more inference requests can be served in parallel) :transformer_instances: the number of transformer instances (the more instances the more inference requests can be served in parallel) :predictor_resource_config: dict for setting resource configuration parameters required to serve the model, for example {'memory': 2048, 'cores': 1, 'gpus': 0}. Currently only supported if Hopsworks is deployed with Kubernetes installed. 
Returns: None Raises: :ValueError: if the serving input failed the validation """ name_pattern = re.compile("^[a-zA-Z0-9]+$") if len(serving_name) > 256 or serving_name == "" or not name_pattern.match( serving_name): raise ValueError( "Name of serving cannot be empty, cannot exceed 256 characters and must match the regular " "expression: ^[a-zA-Z0-9]+$, the provided name: {} is not valid". format(serving_name)) if not hdfs.exists(model_path): raise ValueError( "The model/artifact path must exist in HDFS, the provided path: {} " "does not exist".format(model_path)) if model_server not in constants.MODEL_SERVING.MODEL_SERVERS: raise ValueError( "The provided model_server: {} is not supported, supported " "model servers are: {}".format( model_server, ",".join(constants.MODEL_SERVING.MODEL_SERVERS))) if inference_logging is not None and inference_logging not in constants.MODEL_SERVING.INFERENCE_LOGGING_MODES: raise ValueError( "The provided inference_logging: {} is not supported, supported " "inference logging modes are: {}".format( inference_logging, ",".join(constants.MODEL_SERVING.INFERENCE_LOGGING_MODES))) if not isinstance(model_version, int): raise ValueError( "The model version must be an integer, the provided version is not: {}" .format(model_version)) if model_server == constants.MODEL_SERVING.MODEL_SERVER_TENSORFLOW_SERVING: if not isinstance(num_replicas, int): raise ValueError( "Number of kafka topic replicas must be an integer, the provided num replicas " "is not: {}".format(model_version)) if not isinstance(num_partitions, int): raise ValueError( "Number of kafka topic partitions must be an integer, the provided num partitions " "is not: {}".format(num_partitions)) if not isinstance(batching_enabled, bool): raise ValueError( "Batching enabled must be a boolean, the provided value " "is not: {}".format(batching_enabled)) if kfserving and batching_enabled: raise ValueError( "Batching requests is currently not supported in KFServing deployments" ) if 
kfserving and model_server == constants.MODEL_SERVING.MODEL_SERVER_FLASK: raise ValueError( "Flask is currently not supported for KFServing deployments") if not isinstance(instances, int): raise ValueError("The number of serving instances must be an integer, " "the provided version is not: {}".format(instances)) if not kfserving: if inference_logging is not None and inference_logging != constants.MODEL_SERVING.INFERENCE_LOGGING_ALL: raise ValueError( "Fine-grained inference logging is only supported in KFServing deployments" ) if topic_name is not None and topic_name != "NONE" and inference_logging != constants.MODEL_SERVING.INFERENCE_LOGGING_ALL: raise ValueError( "Inference logging mode 'ALL' is the only mode supported for non-KFServing deployments" ) if kfserving: if topic_name is not None and topic_name != "NONE" and inference_logging is None: raise ValueError( "Inference logging must be defined. Supported inference " "logging modes are: {}".format(",".join( constants.MODEL_SERVING.INFERENCE_LOGGING_MODES))) if predictor_resource_config is not None: if type(predictor_resource_config) is not dict: raise ValueError("predictor_resource_config must be a dict.") if 'memory' not in predictor_resource_config or 'cores' not in predictor_resource_config: raise ValueError( "predictor_resource_config must contain the keys 'memory' and 'cores'" )
def export(model_path, model_name, model_version=1, overwrite=False):
    """
    Copies a trained model to the Models directory in the project and creates the directory structure of:

    >>> Models
    >>>      |
    >>>      - model_name
    >>>                 |
    >>>                 - version_x
    >>>                 |
    >>>                 - version_y

    For example if you run this:

    >>> serving.export("iris_knn.pkl", "irisFlowerClassifier", 1, overwrite=True)

    the local model file "iris_knn.pkl" is exported into the directory
    /Projects/projectname/Models/irisFlowerClassifier/1/ on HDFS.

    The local filesystem is consulted first: if `model_path` exists locally the
    model is exported from there, otherwise it is exported from HDFS. So

    >>> serving.export("Resources/iris_knn.pkl", "irisFlowerClassifier", 1, overwrite=True)

    uses Resources/iris_knn.pkl from the current working directory if it is
    there, and the HDFS path otherwise.

    Args:
        :model_path: path to the trained model (HDFS or local)
        :model_name: name of the model/serving
        :model_version: version of the model/serving
        :overwrite: boolean flag whether to overwrite in case a serving already exists in the exported directory

    Returns:
        The path to where the model was exported

    Raises:
        :ValueError: if the model path exists neither in HDFS nor locally, or if the target
                     directory path is an existing file and overwrite is not set
    """
    if not (hdfs.exists(model_path) or os.path.exists(model_path)):
        raise ValueError("the provided model_path: {} , does not exist in HDFS or on the local filesystem".format(
            model_path))

    # <project>/<models dataset>/<model_name>/<model_version>/
    slash = constants.DELIMITERS.SLASH_DELIMITER
    target_dir = slash.join([
        hdfs.project_path() + constants.MODEL_SERVING.MODELS_DATASET,
        str(model_name),
        str(model_version),
    ]) + slash

    if not hdfs.exists(target_dir):
        hdfs.mkdir(target_dir)

    # A plain file sitting where the version directory belongs is either an
    # error or, with overwrite, replaced by a directory.
    if not overwrite:
        if hdfs.exists(target_dir) and hdfs.isfile(target_dir):
            raise ValueError("Could not create model directory: {}, the path already exists and is a file, "
                             "set flag overwrite=True "
                             "to remove the file and create the correct directory structure".format(target_dir))
    elif hdfs.exists(target_dir) and hdfs.isfile(target_dir):
        hdfs.delete(target_dir)
        hdfs.mkdir(target_dir)

    # Export the model files, preferring a local source over an HDFS one
    if os.path.exists(model_path):
        exporter = _export_local_model
    else:
        exporter = _export_hdfs_model
    return exporter(model_path, target_dir, overwrite)
def _run(sc, map_fun, run_id, args_dict=None, local_logdir=False, name="no-name"):
    """
    Launches the experiment function on the executors and collects the outputs
    when there was a single execution.

    Without `args_dict` the function runs once. Otherwise it runs once per
    position in the argument lists, which must all have the same length.

    Args:
        sc: the spark context
        map_fun: the function to run, passed on to `_prepare_func`
        run_id: id of the run, used to resolve the log directory
        args_dict: optional dict mapping argument names of `map_fun` to equally long lists of values
        local_logdir: passed on to `_prepare_func`
        name: name shown in the Spark job group description

    Returns:
        A tuple (logdir, outputs). For a single execution, outputs is the parsed
        content of the run's `.outputs.json` file, or None if it does not exist.
        For several executions outputs is always None.

    Raises:
        :ValueError: if the argument lists in args_dict differ in length
    """
    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        num_executions = len(arg_lists[0])
        if any(len(arg_list) != num_executions for arg_list in arg_lists):
            raise ValueError('Length of each function argument list must be equal')

    sc.setJobGroup(os.environ['ML_ID'], "{} | Launcher running experiment".format(name))

    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    logdir = experiment_utils._get_logdir(app_id, run_id)

    if args_dict is None:
        # Single run without arguments: outputs live directly in the log dir
        path_to_return = logdir + '/.outputs.json'
    elif num_executions == 1:
        # Single run with arguments: outputs live in a sub-directory named
        # after the "name=value" pairs of the function's arguments
        fn_code = six.get_function_code(map_fun)
        arg_names = fn_code.co_varnames[:fn_code.co_argcount]
        param_string = '&'.join(
            str(param_name) + '=' + str(args_dict[param_name][0])
            for param_name in arg_names)
        path_to_return = logdir + '/' + param_string + '/.outputs.json'
    else:
        return logdir, None

    if hdfs.exists(path_to_return):
        return logdir, json.loads(hdfs.load(path_to_return))
    return logdir, None
def _get_best(args_dict, num_combinations, arg_names, arg_count, hdfs_appid_dir, optimization_key): if not optimization_key: optimization_key = 'metric' max_hp = '' max_val = '' min_hp = '' min_val = '' min_return_dict = {} max_return_dict = {} results = [] first = True for i in range(num_combinations): argIndex = 0 param_string = '' num_args = arg_count while num_args > 0: #Get args for executor and run function param_name = arg_names[argIndex] param_val = args_dict[param_name][i] param_string += str(param_name) + '=' + str(param_val) + '&' num_args -= 1 argIndex += 1 param_string = param_string[:-1] path_to_return = hdfs_appid_dir + '/' + param_string + '/.outputs.json' assert hdfs.exists( path_to_return), 'Could not find .return file on path: {}'.format( path_to_return) with hdfs.open_file(path_to_return, flags="r") as fi: return_dict = json.loads(fi.read()) fi.close() # handle case when dict with 1 key is returned if optimization_key == 'metric' and len(return_dict.keys()) == 1: optimization_key = list(return_dict.keys())[0] metric = float(return_dict[optimization_key]) if first: max_hp = param_string max_val = metric max_return_dict = return_dict min_hp = param_string min_val = metric min_return_dict = return_dict first = False if metric > max_val: max_val = metric max_hp = param_string max_return_dict = return_dict if metric < min_val: min_val = metric min_hp = param_string min_return_dict = return_dict results.append(metric) avg = sum(results) / float(len(results)) return max_val, max_hp, min_val, min_hp, avg, max_return_dict, min_return_dict
def export(model_path, model_name, model_version=None, overwrite=False, metrics=None, description=None,
           synchronous=True, synchronous_timeout=120):
    """
    Copies a trained model to the Models directory in the project and creates the directory structure of:

    >>> Models
    >>>      |
    >>>      - model_name
    >>>                 |
    >>>                 - version_x
    >>>                 |
    >>>                 - version_y

    For example if you run this:

    >>> from hops import model
    >>> model.export("iris_knn.pkl", "irisFlowerClassifier", metrics={'accuracy': accuracy})

    the local model file "iris_knn.pkl" is exported into the version directory
    /Projects/projectname/Models/irisFlowerClassifier/1/ on HDFS (version 1 when no version exists yet).

    If "model" is a directory on the local path exported by TensorFlow, and you run:

    >>> model.export("/model", "mnist", metrics={'accuracy': accuracy, 'loss': loss})

    the model directory is exported into /Projects/projectname/Models/mnist/1/.

    When no model_version is given, the highest existing integer-named version
    directory plus one is used, or 1 if there is none.

    Args:
        :model_path: path to the trained model (HDFS or local)
        :model_name: name of the model
        :model_version: version of the model
        :overwrite: boolean flag whether to overwrite in case a model already exists in the exported directory
        :metrics: dict of evaluation metrics to attach to model
        :description: description about the model
        :synchronous: whether to synchronously wait for the model to be indexed in the models rest endpoint
        :synchronous_timeout: max timeout in seconds for waiting for the model to be indexed

    Returns:
        The path to where the model was exported

    Raises:
        :ValueError: if there was an error with the export of the model due to invalid user input
        :AssertionError: if the project has no dataset named Models
    """
    # Make sure model name is a string, users could supply numbers
    model_name = str(model_name)

    if not isinstance(model_path, string_types):
        model_path = model_path.decode()

    if not description:
        description = 'A collection of models for ' + model_name

    project_path = hdfs.project_path()

    # Raised explicitly instead of via an `assert` statement so the check is
    # still performed when Python runs with optimizations (-O) enabled.
    if not hdfs.exists(project_path + "Models"):
        raise AssertionError("Your project is missing a dataset named Models, please create it.")

    if not hdfs.exists(model_path) and not os.path.exists(model_path):
        raise ValueError("the provided model_path: {} , does not exist in HDFS or on the local filesystem".format(
            model_path))

    # make sure metrics are numbers
    if metrics:
        _validate_metadata(metrics)

    model_dir_hdfs = project_path + constants.MODEL_SERVING.MODELS_DATASET + \
        constants.DELIMITERS.SLASH_DELIMITER + model_name + constants.DELIMITERS.SLASH_DELIMITER

    if not hdfs.exists(model_dir_hdfs):
        hdfs.mkdir(model_dir_hdfs)
        hdfs.chmod(model_dir_hdfs, "ug+rwx")

    # User did not specify model_version, pick the current highest version + 1, set to 1 if no model exists
    if not model_version and hdfs.exists(model_dir_hdfs):
        version_list = []
        for version_dir in hdfs.ls(model_dir_hdfs):
            try:
                if hdfs.isdir(version_dir):
                    version_list.append(int(version_dir[len(model_dir_hdfs):]))
            except Exception:
                # Best effort: entries that are not integer-named version
                # directories (or cannot be inspected) are ignored.
                pass
        if version_list:
            model_version = max(version_list) + 1

    if not model_version:
        model_version = 1

    # Path to directory in HDFS to put the model files
    model_version_dir_hdfs = model_dir_hdfs + str(model_version)

    # If version directory already exists and we are not overwriting it then fail
    if not overwrite and hdfs.exists(model_version_dir_hdfs):
        raise ValueError("Could not create model directory: {}, the path already exists, "
                         "set flag overwrite=True "
                         "to remove the version directory and create the correct directory structure".format(
                             model_version_dir_hdfs))

    # Overwrite version directory by deleting all content (this is needed for Provenance to register Model as deleted)
    if overwrite and hdfs.exists(model_version_dir_hdfs):
        hdfs.delete(model_version_dir_hdfs, recursive=True)
        hdfs.mkdir(model_version_dir_hdfs)

    # At this point we can create the version directory if it does not exists
    if not hdfs.exists(model_version_dir_hdfs):
        hdfs.mkdir(model_version_dir_hdfs)

    # Export the model files
    if os.path.exists(model_path):
        export_dir = _export_local_model(model_path, model_version_dir_hdfs, overwrite)
    else:
        export_dir = _export_hdfs_model(model_path, model_version_dir_hdfs, overwrite)

    print("Exported model " + model_name + " as version " + str(model_version) + " successfully.")

    jobName = os.environ.get(constants.ENV_VARIABLES.JOB_NAME_ENV_VAR)
    kernelId = os.environ.get(constants.ENV_VARIABLES.KERNEL_ID_ENV_VAR)

    # Attach modelName_modelVersion to experiment directory
    model_summary = {'name': model_name, 'version': model_version, 'metrics': metrics,
                     'experimentId': None, 'description': description,
                     'jobName': jobName, 'kernelId': kernelId}
    model_key = model_name + '_' + str(model_version)

    if 'ML_ID' in os.environ:
        # Attach link from experiment to model
        experiment_utils._attach_model_link_xattr(os.environ['ML_ID'], model_key)
        model_summary['experimentId'] = os.environ['ML_ID']

    # Attach model metadata to models version folder
    experiment_utils._attach_model_xattr(model_key, experiment_utils.dumps(model_summary))

    # Model metadata is attached asynchronously by Epipe, therefore this is necessary to ensure
    # following steps in a pipeline will not fail
    if synchronous:
        sleep_seconds = 5
        for _ in range(int(synchronous_timeout / sleep_seconds)):
            try:
                time.sleep(sleep_seconds)
                print("Polling " + model_name + " version " + str(model_version) + " for model availability.")
                resp = get_model(model_name, model_version)
                if resp.ok:
                    print("Model now available.")
                    # Return the export path here as well, as documented
                    return export_dir
                print(model_name + " not ready yet, retrying in " + str(sleep_seconds) + " seconds.")
            except ModelNotFound:
                # Not indexed yet; keep polling until the timeout is used up
                pass
        print("Model not available during polling, set a higher value for synchronous_timeout to wait longer.")

    return export_dir
def _wrapper_fun(iter):
    """
    Wraps the user supplied training function in order to be passed to the Spark Executors.

    Registers with the experiment driver over RPC, then repeatedly asks it for a
    trial (id + parameters), runs the user function ``map_fun`` with those
    parameters and reports the resulting metric, until the client signals it is
    done.

    While the function runs, the builtin ``print`` is replaced by a version that
    also forwards the printed text to the executor's reporter; the original
    ``print`` is restored before the function exits.

    The names ``app_id``, ``run_id``, ``server_addr``, ``hb_interval``,
    ``secret``, ``log_dir``, ``experiment_type``, ``map_fun`` and
    ``optimization_key`` are free variables resolved from the surrounding scope
    (not visible in this block).

    Args:
        :iter: not used by the body

    Returns:
        None

    Raises:
        Any exception raised while registering or running trials is logged
        through the reporter and re-raised.
    """
    experiment_utils._set_ml_id(app_id, run_id)

    # get task context information to determine executor identifier
    partition_id, task_attempt = util.get_partition_attempt_id()

    client = rpc.Client(server_addr, partition_id, task_attempt, hb_interval, secret)
    log_file = (
        log_dir + "/executor_" + str(partition_id) + "_" + str(task_attempt) + ".log"
    )

    # save the builtin print so it can be chained to and restored afterwards
    original_print = __builtin__.print

    reporter = Reporter(log_file, partition_id, task_attempt, original_print)

    def maggy_print(*args, **kwargs):
        """Maggy custom print() function."""
        original_print(*args, **kwargs)
        reporter.log(" ".join(str(x) for x in args), True)

    # override the builtin print
    __builtin__.print = maggy_print

    try:
        client_addr = client.client_addr
        host_port = client_addr[0] + ":" + str(client_addr[1])

        exec_spec = {
            "partition_id": partition_id,
            "task_attempt": task_attempt,
            "host_port": host_port,
            "trial_id": None,
        }

        reporter.log("Registering with experiment driver", False)
        client.register(exec_spec)

        client.start_heartbeat(reporter)

        # blocking
        trial_id, parameters = client.get_suggestion(reporter)

        while not client.done:
            if experiment_type == "ablation":
                # The ablation keys must not be passed on to map_fun, so they
                # are moved out of `parameters`. pop() uses the same "None"
                # default as the former lookup, so a missing key no longer
                # raises KeyError.
                ablation_params = {
                    "ablated_feature": parameters.pop("ablated_feature", "None"),
                    "ablated_layer": parameters.pop("ablated_layer", "None"),
                }

            tb_logdir = log_dir + "/" + trial_id
            trial_log_file = tb_logdir + "/output.log"
            reporter.set_trial_id(trial_id)

            # If trial is repeated, delete trial directory, except log file
            if hopshdfs.exists(tb_logdir):
                util._clean_dir(tb_logdir, [trial_log_file])
            else:
                hopshdfs.mkdir(tb_logdir)

            reporter.init_logger(trial_log_file)
            tensorboard._register(tb_logdir)
            if experiment_type == "ablation":
                hopshdfs.dump(
                    json.dumps(ablation_params, default=util.json_default_numpy),
                    tb_logdir + "/.hparams.json",
                )
            else:
                hopshdfs.dump(
                    json.dumps(parameters, default=util.json_default_numpy),
                    tb_logdir + "/.hparams.json",
                )

            try:
                reporter.log("Starting Trial: {}".format(trial_id), False)
                reporter.log("Trial Configuration: {}".format(parameters), False)

                if experiment_type == "optimization":
                    tensorboard._write_hparams(parameters, trial_id)

                # hand the reporter to the user function only if it asks for it
                sig = inspect.signature(map_fun)
                if sig.parameters.get("reporter", None):
                    retval = map_fun(**parameters, reporter=reporter)
                else:
                    retval = map_fun(**parameters)

                if experiment_type == "optimization":
                    tensorboard._write_session_end()

                retval = util._handle_return_val(
                    retval, tb_logdir, optimization_key, trial_log_file
                )

            except exceptions.EarlyStopException as e:
                # an early-stopped trial reports the metric carried by the exception
                retval = e.metric
                reporter.log("Early Stopped Trial.", False)

            reporter.log("Finished Trial: {}".format(trial_id), False)
            reporter.log("Final Metric: {}".format(retval), False)
            client.finalize_metric(retval, reporter)

            # blocking
            trial_id, parameters = client.get_suggestion(reporter)

    except:  # noqa: E722
        # log whatever went wrong (including non-Exception errors), then propagate
        reporter.log(traceback.format_exc(), False)
        raise
    finally:
        # Undo the print override first, so nothing printed from here on is
        # routed to a reporter that is about to be closed, and so a later call
        # in the same interpreter does not wrap a stale override.
        __builtin__.print = original_print
        try:
            reporter.close_logger()
        finally:
            # release the RPC client even if closing the logger failed
            client.stop()
            client.close()
def __init__(self, experiment_type, **kwargs):
    """
    Sets up the driver state for an experiment of the given type.

    Args:
        :experiment_type: either "optimization" or "ablation"
        :kwargs: experiment configuration. Keys read for every experiment:
            "num_executors", "name", "hb_interval", "description",
            "es_interval", "es_min", "log_dir". Additionally for
            "optimization": "num_trials" (default 1), "searchspace",
            "optimizer", "direction" (default "max"), "es_policy". Additionally
            for "ablation": "ablation_study", "searchspace" (must be empty),
            "ablator".

    Raises:
        :Exception: if the experiment type is unknown or one of the
            type-specific settings has an unsupported value or type
    """
    global driver_secret

    def _config_error(template, value):
        # Build the error for an invalid setting; `template` receives the
        # value's string form as {0} and its type name as {1}.
        return Exception(template.format(str(value), type(value).__name__))

    # ---- state common to all experiment types ----
    self._final_store = []
    self._trial_store = {}
    self.num_executors = kwargs.get("num_executors")
    self._message_q = queue.Queue()
    self.name = kwargs.get("name")
    self.experiment_done = False
    self.worker_done = False
    self.hb_interval = kwargs.get("hb_interval")
    self.description = kwargs.get("description")
    self.experiment_type = experiment_type
    self.es_interval = kwargs.get("es_interval")
    self.es_min = kwargs.get("es_min")

    # ---- state specific to the experiment type ----
    if self.experiment_type == "optimization":
        self.num_trials = kwargs.get("num_trials", 1)

        # search space: missing -> empty Searchspace, otherwise must be one
        space = kwargs.get("searchspace")
        if space is None:
            self.searchspace = Searchspace()
        elif isinstance(space, Searchspace):
            self.searchspace = space
        else:
            raise _config_error(
                "The experiment's search space should be an instance of maggy.Searchspace, "
                "but it is {0} (of type '{1}').",
                space,
            )

        # optimizer: None / known name / custom AbstractOptimizer instance
        optimizer = kwargs.get("optimizer")
        if optimizer is None:
            if len(self.searchspace.names()) != 0:
                raise Exception(
                    "Searchspace has to be empty or None to use without optimizer"
                )
            self.optimizer = SingleRun()
        elif isinstance(optimizer, str):
            optimizer_name = optimizer.lower()
            if optimizer_name == "randomsearch":
                self.optimizer = RandomSearch()
            elif optimizer_name == "asha":
                self.optimizer = Asha()
            elif optimizer_name == "none":
                if len(self.searchspace.names()) != 0:
                    raise Exception(
                        "Searchspace has to be empty or None to use without Optimizer."
                    )
                self.optimizer = SingleRun()
            else:
                raise Exception(
                    "Unknown Optimizer. Can't initialize experiment driver."
                )
        elif isinstance(optimizer, AbstractOptimizer):
            self.optimizer = optimizer
            print("Custom Optimizer initialized.")
        else:
            raise _config_error(
                "The experiment's optimizer should either be an string indicating the name "
                "of an implemented optimizer (such as 'randomsearch') or an instance of "
                "maggy.optimizer.AbstractOptimizer, "
                "but it is {0} (of type '{1}').",
                optimizer,
            )

        # optimization direction, stored lower-cased
        direction = kwargs.get("direction", "max")
        if not isinstance(direction, str) or direction.lower() not in ("min", "max"):
            raise _config_error(
                "The experiment's direction should be an string (either 'min' or 'max') "
                "but it is {0} (of type '{1}').",
                direction,
            )
        self.direction = direction.lower()

        # early stopping: known name or custom AbstractEarlyStop instance
        es_policy_msg = (
            "The experiment's early stopping policy should either be a string ('median' or 'none') "
            "or a custom policy that is an instance of maggy.earlystop.AbstractEarlyStop, "
            "but it is {0} (of type '{1}')."
        )
        es_policy = kwargs.get("es_policy")
        if isinstance(es_policy, str):
            builtin_rules = {"median": MedianStoppingRule, "none": NoStoppingRule}
            rule = builtin_rules.get(es_policy.lower())
            if rule is None:
                raise _config_error(es_policy_msg, es_policy)
            self.earlystop_check = rule.earlystop_check
        elif isinstance(es_policy, AbstractEarlyStop):
            self.earlystop_check = es_policy.earlystop_check
            print("Custom Early Stopping policy initialized.")
        else:
            raise _config_error(es_policy_msg, es_policy)

        # es_interval / es_min were already taken from kwargs in the common
        # section above.
        self.result = {"best_val": "n.a.", "num_trials": 0, "early_stopped": 0}

    elif self.experiment_type == "ablation":
        # ablation trials are never stopped early
        self.earlystop_check = NoStoppingRule.earlystop_check

        ablation_study = kwargs.get("ablation_study")
        if not isinstance(ablation_study, AblationStudy):
            raise _config_error(
                "The experiment's ablation study configuration should be an instance of "
                "maggy.ablation.AblationStudy, "
                "but it is {0} (of type '{1}').",
                ablation_study,
            )
        self.ablation_study = ablation_study

        # any truthy search space is rejected for ablation experiments
        space = kwargs.get("searchspace")
        if space:
            raise _config_error(
                "The experiment's search space should be None for ablation experiments, "
                "but it is {0} (of type '{1}').",
                space,
            )
        self.searchspace = Searchspace()

        ablator_msg = (
            "The experiment's ablation study policy should either be a string ('loco') "
            "or a custom policy that is an instance of maggy.ablation.ablation.AbstractAblator, "
            "but it is {0} (of type '{1}')."
        )
        ablator = kwargs.get("ablator")
        if isinstance(ablator, str):
            if ablator.lower() != "loco":
                raise _config_error(ablator_msg, ablator)
            self.ablator = LOCO(ablation_study, self._final_store)
            self.num_trials = self.ablator.get_number_of_trials()
            # never use more executors than there are trials
            if self.num_trials < self.num_executors:
                self.num_executors = self.num_trials
        elif isinstance(ablator, AbstractAblator):
            # NOTE(review): this branch sets neither self.num_trials nor caps
            # self.num_executors - confirm that is intended for custom ablators.
            self.ablator = ablator
            print("Custom Ablator initialized. \n")
        else:
            raise _config_error(ablator_msg, ablator)

        self.result = {"best_val": "n.a.", "num_trials": 0, "early_stopped": "n.a"}

    else:
        raise Exception(
            "Unknown experiment type. experiment_type should be either 'optimization' or 'ablation', "
            "but it is {0}.".format(str(self.experiment_type))
        )

    # ---- finalize: RPC server, shared secret, logging ----
    self.server = rpc.Server(self.num_executors)

    # the module-level secret is generated once and then reused
    if not driver_secret:
        driver_secret = self._generate_secret(ExperimentDriver.SECRET_BYTES)
    self._secret = driver_secret

    self.job_start = datetime.now()
    self.executor_logs = ""
    self.maggy_log = ""
    self.log_lock = threading.RLock()

    log_dir = kwargs.get("log_dir")
    self.log_file = log_dir + "/maggy.log"
    self.log_dir = log_dir
    self.exception = None

    # Open File desc for HDFS to log; create an empty log file first if needed
    if not hopshdfs.exists(self.log_file):
        hopshdfs.dump("", self.log_file)
    self.fd = hopshdfs.open_file(self.log_file, flags="w")
def exists(self, hdfs_path, project=None):
    """
    Checks a path by delegating to ``hopshdfs.exists``.

    Args:
        :hdfs_path: the path to check, passed through unchanged
        :project: forwarded as the ``project`` keyword argument (default None)

    Returns:
        whatever ``hopshdfs.exists`` returns for the given arguments
    """
    path_found = hopshdfs.exists(hdfs_path, project=project)
    return path_found