def _load_hparams(hparams_file):
    """Loads the HParams configuration from a hparams file of a trial."""
    hparams_file_contents = hopshdfs.load(hparams_file)
    hparams = json.loads(hparams_file_contents)
    return hparams
def read_featureframe(self, spark):
    """
    Reads a training dataset in hdf5 format from HopsFS

    Args:
        :spark: the spark session

    Returns:
        dataframe with the data of the training dataset

    Raises:
        :TrainingDatasetNotFound: if the requested training dataset could not be found
        :CouldNotConvertDataframe: if the hdf5 dataset could not be converted to a spark dataframe
        :HDF5DatasetFormatNotSupportedForExternalTrainingDatasets: if the user tries to read an
            external training dataset in the .hdf5 format
    """
    if not hasattr(self, 'training_dataset') or \
            self.training_dataset.training_dataset_type \
            == constants.REST_CONFIG.JSON_TRAINING_DATASET_EXTERNAL_TYPE:
        raise HDF5DatasetFormatNotSupportedForExternalTrainingDatasets(
            "The .hdf5 dataset format is not supported for external training datasets.")
    if not hdfs.exists(self.path + constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX):
        raise TrainingDatasetNotFound(
            "Could not find a training dataset in file {}".format(
                self.path + constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX))

    # Stage the hdf5 file in a local temporary file so that h5py can open it
    tf = TemporaryFile()
    data = hdfs.load(self.path + constants.FEATURE_STORE.TRAINING_DATASET_HDF5_SUFFIX)
    tf.write(data)
    tf.seek(0)
    hdf5_file = h5py.File(tf, 'r')
    np_array = hdf5_file[self.training_dataset.name][()]

    if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_NUMPY:
        return np_array
    if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PYTHON:
        return np_array.tolist()
    if self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_SPARK \
            or self.dataframe_type == constants.FEATURE_STORE.DATAFRAME_TYPE_PANDAS:
        if np_array.ndim != 2:
            raise CouldNotConvertDataframe(
                "Cannot convert a numpy array that does not have two dimensions to a dataframe. "
                "The number of dimensions is: {}".format(np_array.ndim))
        # Name the columns col_0, col_1, ... and convert via pandas to the requested dataframe type
        num_cols = np_array.shape[1]
        dataframe_dict = {}
        for n_col in list(range(num_cols)):
            col_name = "col_" + str(n_col)
            dataframe_dict[col_name] = np_array[:, n_col]
        pandas_df = pd.DataFrame(dataframe_dict)
        sc = spark.sparkContext
        sql_context = SQLContext(sc)
        return fs_utils._return_dataframe_type(
            sql_context.createDataFrame(pandas_df), self.dataframe_type)
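# A hypothetical usage sketch (not part of the source): `fs_handle` stands in for an
# instance of the class that defines read_featureframe above, with self.path,
# self.training_dataset and self.dataframe_type already configured.
def _example_read_hdf5_training_dataset(fs_handle, spark):
    # With the spark dataframe type configured, this yields a dataframe with
    # columns named col_0, col_1, ...
    return fs_handle.read_featureframe(spark)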
def parse_input_json(hdfs_path):
    """
    Parse input JSON command line arguments for the util job

    Args:
        :hdfs_path: path to the JSON input on HDFS

    Returns:
        The parsed JSON (dict)
    """
    return json.loads(hdfs.load(hdfs_path))
def _convert_return_file_to_arr(return_file_path):
    return_file_contents = hdfs.load(return_file_path)

    # The return file could be a single number (the metric)
    try:
        metric = int(return_file_contents)
        return [{'metric': metric}]
    except:
        # Not a plain number, fall through to JSON parsing
        pass

    # Otherwise treat the return file as a JSON object of metric name -> value
    return_json = json.loads(return_file_contents)
    metric_dict = {}
    for metric_key in return_json:
        metric_dict[metric_key] = return_json[metric_key]
    return metric_dict
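# A hypothetical usage sketch (not part of the source): loads a finished trial's return
# file and normalises it into metrics. The '/.outputs.json' filename is borrowed from the
# other helpers in this module and is an assumption here.
def _example_trial_metrics(trial_logdir):
    return _convert_return_file_to_arr(trial_logdir + '/.outputs.json')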
def _get_params_dict(best_dir):
    """
    Utility method for loading the hyperparameters of the best trial as a dict

    Args:
        :best_dir: the logdir of the best trial

    Returns:
        a dict with param->value
    """
    params_json = hdfs.load(best_dir + '/.hparams.json')
    params_dict = json.loads(params_json)
    return params_dict
def _get_best(root_logdir, direction):
    min_val = sys.float_info.max
    min_logdir = None
    # Initialize to the most negative representable float so that any metric,
    # including a negative one, can become the new maximum
    max_val = -sys.float_info.max
    max_logdir = None

    generation_folders = hdfs.ls(root_logdir)
    generation_folders.sort()

    # Scan every individual's .metric file in every generation and track the
    # logdirs of the smallest and largest metric values
    for generation in generation_folders:
        for individual in hdfs.ls(generation):
            individual_files = hdfs.ls(individual, recursive=True)
            for file in individual_files:
                if file.endswith("/.metric"):
                    val = float(hdfs.load(file))
                    if val > max_val:
                        max_val = val
                        max_logdir = file[:-8]  # strip the trailing "/.metric"
                    if val < min_val:
                        min_val = val
                        min_logdir = file[:-8]

    if direction.upper() == Direction.MAX:
        with hdfs.open_file(max_logdir + '/.outputs.json', flags="r") as fi:
            return_dict = json.loads(fi.read())
        return max_logdir, return_dict
    else:
        with hdfs.open_file(min_logdir + '/.outputs.json', flags="r") as fi:
            return_dict = json.loads(fi.read())
        return min_logdir, return_dict
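# A hypothetical usage sketch (not part of the source): retrieves the logdir and parsed
# .outputs.json of the best (maximising) trial under a run directory. `root_logdir` is
# assumed to contain the generation.* subfolders scanned by _get_best above.
def _example_print_best_trial(root_logdir):
    best_logdir, best_outputs = _get_best(root_logdir, Direction.MAX)
    print("Best trial logdir: {}".format(best_logdir))
    print("Best trial outputs: {}".format(best_outputs))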
def _get_return_file(param_string, app_id, generation_id, run_id):
    """
    Looks up the .outputs.json return file of a parameter combination in the
    earlier generations of a run.

    Args:
        :param_string: the parameter combination string identifying the trial
        :app_id: the application id
        :generation_id: the current generation id
        :run_id: the run id

    Returns:
        the contents of the return file if it exists, otherwise None
    """
    handle = hdfs.get()
    for i in range(generation_id):
        possible_result_path = experiment_utils._get_experiments_dir() + '/' + app_id + '_' \
            + str(run_id) + '/generation.' + str(i) + '/' + param_string + '/.outputs.json'
        if handle.exists(possible_result_path):
            return_file_contents = hdfs.load(possible_result_path)
            return return_file_contents
    return None
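# A hypothetical usage sketch (not part of the source): checks whether a parameter
# combination was already evaluated in an earlier generation and, if so, returns its
# parsed outputs instead of re-running the trial.
def _example_reuse_previous_result(param_string, app_id, generation_id, run_id):
    previous = _get_return_file(param_string, app_id, generation_id, run_id)
    if previous is not None:
        return json.loads(previous)
    return None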
def _convert_param_to_arr(params_file):
    # Load the hyperparameter file from HDFS and parse it as JSON
    params = hdfs.load(params_file)
    params_dict = json.loads(params)
    return params_dict
def load(self, hparams_file):
    return hopshdfs.load(hparams_file)
def _run(sc, map_fun, run_id, args_dict=None, local_logdir=False, name="no-name"):
    """
    Runs the experiment by launching one Spark task per argument combination.

    Args:
        :sc: the spark context
        :map_fun: the user-supplied experiment function to run on the executors
        :run_id: the id of this run
        :args_dict: dict mapping argument name -> list of values, one value per execution
        :local_logdir: whether to write logs to the local filesystem instead of HDFS
        :name: the name of the experiment

    Returns:
        the experiment logdir and, if a .outputs.json return file exists, its parsed contents
    """
    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        # All argument lists must have the same length; that length is the number of executions
        arg_lists = list(args_dict.values())
        currentLen = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if currentLen != len(arg_lists[i]):
                raise ValueError(
                    'Length of each function argument list must be equal')
            num_executions = len(arg_lists[i])

    sc.setJobGroup(os.environ['ML_ID'], "{} | Launcher running experiment".format(name))

    # Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    # Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    # For a single run, return the contents of .outputs.json if it exists
    if args_dict is None:
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    elif num_executions == 1:
        # Rebuild the param_string ("name1=val1&name2=val2&...") that identifies
        # the single trial's subdirectory
        arg_count = six.get_function_code(map_fun).co_argcount
        arg_names = six.get_function_code(map_fun).co_varnames
        argIndex = 0
        param_string = ''
        while arg_count > 0:
            param_name = arg_names[argIndex]
            param_val = args_dict[param_name][0]
            param_string += str(param_name) + '=' + str(param_val) + '&'
            arg_count -= 1
            argIndex += 1
        param_string = param_string[:-1]
        path_to_return = experiment_utils._get_logdir(
            app_id, run_id) + '/' + param_string + '/.outputs.json'
        if hdfs.exists(path_to_return):
            return_json = hdfs.load(path_to_return)
            return_dict = json.loads(return_json)
            return experiment_utils._get_logdir(app_id, run_id), return_dict
        else:
            return experiment_utils._get_logdir(app_id, run_id), None
    else:
        return experiment_utils._get_logdir(app_id, run_id), None
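# A hypothetical usage sketch (not part of the source): launches a two-trial grid through
# _run above. The argument lists must be of equal length (one entry per execution) and the
# keys should match the parameter names of `train_fn`; `sc`, `train_fn` and `run_id` are
# assumptions made for illustration only.
def _example_launch_experiment(sc, train_fn, run_id):
    args_dict = {'learning_rate': [0.001, 0.01], 'dropout': [0.5, 0.3]}
    return _run(sc, train_fn, run_id, args_dict=args_dict, name="example-grid")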