def _create_experiment_dir(app_id, run_id):
    experiment_path = _get_logdir(app_id, run_id)

    if hdfs.exists(experiment_path):
        hdfs.delete(experiment_path, recursive=True)

    hdfs.mkdir(experiment_path)
def mirrored(map_fun, name='no-name', local_logdir=False, description=None, evaluator=False):
    """
    *Distributed Training*

    Example usage:

    >>> from hops import experiment
    >>> def mirrored_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...MirroredStrategy()...
    >>> experiment.mirrored(mirrored_training, local_logdir=True)

    Args:
        :map_fun: contains the code where you are using MirroredStrategy.
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    num_workers = util.num_executors()
    if evaluator:
        assert num_workers > 2, "number of workers must be at least 3 if evaluator is set to True"

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'mirrored', 'DISTRIBUTED_TRAINING', None, description, app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = mirrored_impl._run(
            sc, map_fun, run_id, local_logdir=local_logdir, name=name, evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(
            experiment_json, None, app_id, run_id, 'FINISHED', duration, logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
def collective_all_reduce(map_fun, name='no-name', local_logdir=False, description=None, evaluator=False):
    """
    *Distributed Training*

    Sets up the cluster to run CollectiveAllReduceStrategy.

    TF_CONFIG is exported in the background and does not need to be set by the user.

    Example usage:

    >>> from hops import experiment
    >>> def distributed_training():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from hops import tensorboard
    >>>    from hops import devices
    >>>    logdir = tensorboard.logdir()
    >>>    ...CollectiveAllReduceStrategy(num_gpus_per_worker=devices.get_num_gpus())...
    >>> experiment.collective_all_reduce(distributed_training, local_logdir=True)

    Args:
        :map_fun: the function containing code to run CollectiveAllReduceStrategy
        :name: the name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :evaluator: whether to run one of the workers as an evaluator

    Returns:
        HDFS path in your project where the experiment is stored and return value from the process running as chief

    """
    num_ps = util.num_param_servers()
    num_executors = util.num_executors()

    assert num_ps == 0, "number of parameter servers should be 0"
    assert num_executors > 1, "number of workers (executors) should be greater than 1"

    if evaluator:
        assert num_executors > 2, "number of workers must be at least 3 if evaluator is set to True"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'collective_all_reduce', 'DISTRIBUTED_TRAINING', None, description, app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = allreduce_impl._run(
            sc, map_fun, run_id, local_logdir=local_logdir, name=name, evaluator=evaluator)
        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(
            experiment_json, None, app_id, run_id, 'FINISHED', duration, logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
def grid_search(map_fun, grid_dict, direction=Direction.MAX, name='no-name', local_logdir=False, description=None,
                optimization_key='metric'):
    """
    *Parallel Experiment*

    Run grid search to explore a predefined set of hyperparameter combinations.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> grid_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.grid_search(train_nn, grid_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :map_fun: the function to run, must return a metric
        :grid_dict: a dict with a key for each argument with a corresponding value being a list containing the hyperparameters to test, internally all possible combinations will be generated and run as separate Experiments (see the sketch after this function)
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'grid_search', 'PARALLEL_EXPERIMENTS', json.dumps(grid_dict), description, app_id, direction,
            optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        grid_params = experiment_utils.grid_params(grid_dict)

        logdir, best_param, best_metric, return_dict = grid_search_impl._run(
            sc, map_fun, run_id, grid_params, direction=direction, local_logdir=local_logdir, name=name,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir, optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
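# Illustrative sketch only, not the library's implementation: the Cartesian-product
# expansion that experiment_utils.grid_params performs on grid_dict (as described in the
# docstring above) can be pictured as follows. The helper name _grid_combinations_example
# is an assumption used purely for illustration.
def _grid_combinations_example(grid_dict):
    """Return a list of dicts, one per hyperparameter combination (illustration only)."""
    import itertools
    keys = list(grid_dict.keys())
    # e.g. {'learning_rate': [0.1, 0.3], 'layers': [2, 9]} expands to 2 x 2 = 4 combinations:
    # [{'learning_rate': 0.1, 'layers': 2}, {'learning_rate': 0.1, 'layers': 9}, ...]
    return [dict(zip(keys, values))
            for values in itertools.product(*(grid_dict[k] for k in keys))]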
def launch(map_fun, args_dict=None, name='no-name', local_logdir=False, description=None, metric_key=None):
    """
    *Experiment* or *Parallel Experiment*

    Run an Experiment contained in *map_fun* one time with no arguments or multiple times with different arguments if
    *args_dict* is specified.

    Example usage:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>> experiment.launch(train_nn)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> def train_nn():
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> experiment.launch(train_nn)

    Args:
        :map_fun: The function to run
        :args_dict: If specified will run the same function multiple times with different arguments, {'a':[1,2], 'b':[5,3]} would run the function two times with arguments (1,5) and (2,3) provided that the function signature contains two arguments like *def func(a,b):*
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :metric_key: If returning a dict with multiple return values, this key should match the name of the key in the dict for the metric you want to associate with the experiment

    Returns:
        HDFS path in your project where the experiment is stored

    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.end() to stop it.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = None
        if args_dict:
            experiment_json = experiment_utils._populate_experiment(
                name, 'launch', 'EXPERIMENT', json.dumps(args_dict), description, app_id, None, None)
        else:
            experiment_json = experiment_utils._populate_experiment(
                name, 'launch', 'EXPERIMENT', None, description, app_id, None, None)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, return_dict = launcher._run(sc, map_fun, run_id, args_dict, local_logdir)
        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        metric = experiment_utils._get_metric(return_dict, metric_key)

        experiment_utils._finalize_experiment(
            experiment_json, metric, app_id, run_id, 'FINISHED', duration, logdir, None, None)

        return logdir, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
def differential_evolution(objective_function, boundary_dict, direction=Direction.MAX, generations=4, population=6,
                           mutation=0.5, crossover=0.7, name='no-name', local_logdir=False, description=None,
                           optimization_key='metric'):
    """
    *Parallel Experiment*

    Run differential evolution to explore a given search space for each hyperparameter and figure out the best hyperparameter combination.
    The function is treated as a blackbox that returns a metric for some given hyperparameter combination.
    The returned metric is used to evaluate how 'good' the hyperparameter combination was.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    import tensorflow
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.differential_evolution(train_nn, boundary_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :objective_function: the function to run, must return a metric
        :boundary_dict: a dict where each key corresponds to an argument of *objective_function* and the corresponding value should be a list of two elements. The first element being the lower bound for the parameter and the second element the upper bound.
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :generations: number of generations
        :population: size of population
        :mutation: mutation rate to explore more different hyperparameters
        :crossover: how fast to adapt the population to the best in each generation
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: a longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        diff_evo_impl.run_id = run_id

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'differential_evolution', 'PARALLEL_EXPERIMENTS', json.dumps(boundary_dict), description, app_id,
            direction, optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, best_param, best_metric, return_dict = diff_evo_impl._run(
            objective_function, boundary_dict, direction=direction, generations=generations, population=population,
            mutation=mutation, crossover=crossover, cleanup_generations=False, local_logdir=local_logdir, name=name,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir, optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
def random_search(map_fun, boundary_dict, direction=Direction.MAX, samples=10, name='no-name', local_logdir=False,
                  description=None, optimization_key='metric'):
    """
    *Parallel Experiment*

    Run an Experiment contained in *map_fun* for a configured number of random samples controlled by the *samples* parameter.
    Each hyperparameter is contained in *boundary_dict* with the key corresponding to the name of the hyperparameter
    and a list containing two elements defining the lower and upper bound.
    The experiment must return a metric corresponding to how 'good' the given hyperparameter combination is.

    Example usage:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    return network.evaluate(learning_rate, layers, dropout)
    >>> experiment.random_search(train_nn, boundary_dict, direction=Direction.MAX)

    Returning multiple outputs, including images and logs:

    >>> from hops import experiment
    >>> boundary_dict = {'learning_rate': [0.1, 0.3], 'layers': [2, 9], 'dropout': [0.1, 0.9]}
    >>> def train_nn(learning_rate, layers, dropout):
    >>>    # Do all imports in the function
    >>>    import tensorflow
    >>>    # Put all code inside the wrapper function
    >>>    from PIL import Image
    >>>    f = open('logfile.txt', 'w')
    >>>    f.write('Starting training...')
    >>>    accuracy, loss = network.evaluate(learning_rate, layers, dropout)
    >>>    img = Image.new(.....)
    >>>    img.save('diagram.png')
    >>>    return {'accuracy': accuracy, 'loss': loss, 'logfile': 'logfile.txt', 'diagram': 'diagram.png'}
    >>> # Important! Remember: optimization_key must be set when returning multiple outputs
    >>> experiment.random_search(train_nn, boundary_dict, direction=Direction.MAX, optimization_key='accuracy')

    Args:
        :map_fun: The function to run
        :boundary_dict: dict containing hyperparameter name and corresponding boundaries, each experiment randomizes a value within the boundary range (see the sketch after this function).
        :direction: Direction.MAX to maximize the returned metric, Direction.MIN to minimize the returned metric
        :samples: the number of random samples to evaluate for each hyperparameter given the boundaries, for example samples=3 would result in 3 hyperparameter combinations in total to evaluate
        :name: name of the experiment
        :local_logdir: True if *tensorboard.logdir()* should be in the local filesystem, otherwise it is in HDFS
        :description: A longer description for the experiment
        :optimization_key: When returning a dict, the key name of the metric to maximize or minimize in the dict should be set as this value

    Returns:
        HDFS path in your project where the experiment is stored, dict with best hyperparameters and return dict with best metrics

    """
    num_ps = util.num_param_servers()
    assert num_ps == 0, "number of parameter servers should be 0"

    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    start = time.time()
    sc = util._find_spark().sparkContext
    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        _start_run()

        hdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        experiment_json = experiment_utils._populate_experiment(
            name, 'random_search', 'PARALLEL_EXPERIMENTS', json.dumps(boundary_dict), description, app_id, direction,
            optimization_key)

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, 'CREATE')

        logdir, best_param, best_metric, return_dict = r_search_impl._run(
            sc, map_fun, run_id, boundary_dict, samples, direction=direction, local_logdir=local_logdir,
            optimization_key=optimization_key)
        duration = experiment_utils._seconds_to_milliseconds(time.time() - start)

        experiment_utils._finalize_experiment(
            experiment_json, best_metric, app_id, run_id, 'FINISHED', duration,
            experiment_utils._get_logdir(app_id, run_id), logdir, optimization_key)

        return logdir, best_param, return_dict
    except:
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - start))
        raise
    finally:
        _end_run(sc)
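# Illustrative sketch only, not the library's implementation: one way a single random
# hyperparameter combination could be drawn from a boundary_dict as described above.
# The helper name _sample_combination_example is an assumption used purely for illustration.
def _sample_combination_example(boundary_dict):
    """Draw one value per hyperparameter, uniformly between its lower and upper bound."""
    import random
    combination = {}
    for param, (lower, upper) in boundary_dict.items():
        if isinstance(lower, int) and isinstance(upper, int):
            combination[param] = random.randint(lower, upper)   # integer-valued bounds
        else:
            combination[param] = random.uniform(lower, upper)   # float-valued bounds
    return combination
# Example: _sample_combination_example({'learning_rate': [0.1, 0.3], 'layers': [2, 9]})
# might return {'learning_rate': 0.17, 'layers': 5}.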
def _wrapper_fun(iter): """ Wraps the user supplied training function in order to be passed to the Spark Executors. Args: iter: Returns: """ experiment_utils._set_ml_id(app_id, run_id) # get task context information to determine executor identifier partition_id, task_attempt = util.get_partition_attempt_id() client = rpc.Client(server_addr, partition_id, task_attempt, hb_interval, secret) log_file = (log_dir + "/executor_" + str(partition_id) + "_" + str(task_attempt) + ".log") # save the builtin print original_print = __builtin__.print reporter = Reporter(log_file, partition_id, task_attempt, original_print) def maggy_print(*args, **kwargs): """Maggy custom print() function.""" original_print(*args, **kwargs) reporter.log(" ".join(str(x) for x in args), True) # override the builtin print __builtin__.print = maggy_print try: client_addr = client.client_addr host_port = client_addr[0] + ":" + str(client_addr[1]) exec_spec = {} exec_spec["partition_id"] = partition_id exec_spec["task_attempt"] = task_attempt exec_spec["host_port"] = host_port exec_spec["trial_id"] = None reporter.log("Registering with experiment driver", False) client.register(exec_spec) client.start_heartbeat(reporter) # blocking trial_id, parameters = client.get_suggestion(reporter) while not client.done: if experiment_type == "ablation": ablation_params = { "ablated_feature": parameters.get("ablated_feature", "None"), "ablated_layer": parameters.get("ablated_layer", "None"), } parameters.pop("ablated_feature") parameters.pop("ablated_layer") tb_logdir = log_dir + "/" + trial_id trial_log_file = tb_logdir + "/output.log" reporter.set_trial_id(trial_id) # If trial is repeated, delete trial directory, except log file if hopshdfs.exists(tb_logdir): util._clean_dir(tb_logdir, [trial_log_file]) else: hopshdfs.mkdir(tb_logdir) reporter.init_logger(trial_log_file) tensorboard._register(tb_logdir) if experiment_type == "ablation": hopshdfs.dump( json.dumps(ablation_params, default=util.json_default_numpy), tb_logdir + "/.hparams.json", ) else: hopshdfs.dump( json.dumps(parameters, default=util.json_default_numpy), tb_logdir + "/.hparams.json", ) try: reporter.log("Starting Trial: {}".format(trial_id), False) reporter.log("Trial Configuration: {}".format(parameters), False) if experiment_type == "optimization": tensorboard._write_hparams(parameters, trial_id) sig = inspect.signature(map_fun) if sig.parameters.get("reporter", None): retval = map_fun(**parameters, reporter=reporter) else: retval = map_fun(**parameters) if experiment_type == "optimization": tensorboard._write_session_end() retval = util._handle_return_val(retval, tb_logdir, optimization_key, trial_log_file) except exceptions.EarlyStopException as e: retval = e.metric reporter.log("Early Stopped Trial.", False) reporter.log("Finished Trial: {}".format(trial_id), False) reporter.log("Final Metric: {}".format(retval), False) client.finalize_metric(retval, reporter) # blocking trial_id, parameters = client.get_suggestion(reporter) except: # noqa: E722 reporter.log(traceback.format_exc(), False) raise finally: reporter.close_logger() client.stop() client.close()
def mkdir(self, hdfs_path, project=None):
    """Create a directory at hdfs_path, delegating to hopshdfs.mkdir."""
    return hopshdfs.mkdir(hdfs_path, project=project)
def _wrapper_fun(iter): """ Args: iter: Returns: """ for i in iter: executor_num = i experiment_utils._set_ml_id(app_id, run_id) t = threading.Thread(target=devices._print_periodic_gpu_utilization) if devices.get_num_gpus() > 0: t.start() is_chief = False logdir = None tb_hdfs_path = None try: host = experiment_utils._get_ip_address() tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) tmp_socket.bind(('', 0)) port = tmp_socket.getsockname()[1] client = mirrored_reservation.Client(server_addr) host_port = host + ":" + str(port) client.register({"worker": host_port, "index": executor_num}) cluster = client.await_reservations() tmp_socket.close() client.close() task_index = experiment_utils._find_index(host_port, cluster) if task_index == -1: cluster["task"] = {"type": "chief", "index": 0} else: cluster["task"] = {"type": "worker", "index": task_index} evaluator_node = None if evaluator: last_worker_index = len(cluster["cluster"]["worker"]) - 1 evaluator_node = cluster["cluster"]["worker"][ last_worker_index] cluster["cluster"]["evaluator"] = [evaluator_node] del cluster["cluster"]["worker"][last_worker_index] if evaluator_node == host_port: cluster["task"] = {"type": "evaluator", "index": 0} print('TF_CONFIG: {} '.format(cluster)) if num_executors > 1: os.environ["TF_CONFIG"] = json.dumps(cluster) is_chief = (cluster["task"]["type"] == "chief") logfile = experiment_utils._init_logger( experiment_utils._get_logdir(app_id, run_id), role=cluster["task"]["type"], index=cluster["task"]["index"]) dist_logdir = experiment_utils._get_logdir(app_id, run_id) + '/logdir' if is_chief: hdfs.mkdir(dist_logdir) tensorboard._register(dist_logdir, experiment_utils._get_logdir( app_id, run_id), executor_num, local_logdir=local_logdir) else: tensorboard.events_logdir = dist_logdir print(devices._get_gpu_info()) print('-------------------------------------------------------') print('Started running task') task_start = time.time() retval = train_fn() if is_chief: experiment_utils._handle_return_simple( retval, experiment_utils._get_logdir(app_id, run_id), logfile) task_end = time.time() time_str = 'Finished task - took ' + experiment_utils._time_diff( task_start, task_end) print(time_str) print('-------------------------------------------------------') except: raise finally: experiment_utils._cleanup(tensorboard, t)
def export(model_path, model_name, model_version=None, overwrite=False, metrics=None, description=None,
           synchronous=True, synchronous_timeout=120):
    """
    Copies a trained model to the Models directory in the project and creates the directory structure of:

    >>> Models
    >>>      |
    >>>      - model_name
    >>>                 |
    >>>                 - version_x
    >>>                 |
    >>>                 - version_y

    For example if you run this:

    >>> from hops import model
    >>> model.export("iris_knn.pkl", "irisFlowerClassifier", metrics={'accuracy': accuracy})

    It will copy the local model file "iris_knn.pkl" to /Projects/projectname/Models/irisFlowerClassifier/1/iris_knn.pkl
    on HDFS, and overwrite in case there already exists a file with the same name in the directory.

    If "model" is a directory on the local path exported by TensorFlow, and you run:

    >>> model.export("/model", "mnist", metrics={'accuracy': accuracy, 'loss': loss})

    It will copy the model directory contents to /Projects/projectname/Models/mnist/1/, e.g. the "model.pb" file and
    the "variables" directory.

    Args:
        :model_path: path to the trained model (HDFS or local)
        :model_name: name of the model
        :model_version: version of the model
        :overwrite: boolean flag whether to overwrite in case a model already exists in the exported directory
        :metrics: dict of evaluation metrics to attach to model
        :description: description about the model
        :synchronous: whether to synchronously wait for the model to be indexed in the models rest endpoint
        :synchronous_timeout: max timeout in seconds for waiting for the model to be indexed

    Returns:
        The path to where the model was exported

    Raises:
        :ValueError: if there was an error with the export of the model due to invalid user input
        :ModelNotFound: if the model was not found
    """
    # Make sure model name is a string, users could supply numbers
    model_name = str(model_name)

    if not isinstance(model_path, string_types):
        model_path = model_path.decode()

    if not description:
        description = 'A collection of models for ' + model_name

    project_path = hdfs.project_path()

    assert hdfs.exists(project_path + "Models"), "Your project is missing a dataset named Models, please create it."

    if not hdfs.exists(model_path) and not os.path.exists(model_path):
        raise ValueError("the provided model_path: {} , does not exist in HDFS or on the local filesystem".format(
            model_path))

    # make sure metrics are numbers
    if metrics:
        _validate_metadata(metrics)

    model_dir_hdfs = project_path + constants.MODEL_SERVING.MODELS_DATASET + \
                     constants.DELIMITERS.SLASH_DELIMITER + model_name + constants.DELIMITERS.SLASH_DELIMITER

    if not hdfs.exists(model_dir_hdfs):
        hdfs.mkdir(model_dir_hdfs)
        hdfs.chmod(model_dir_hdfs, "ug+rwx")

    # User did not specify model_version, pick the current highest version + 1, set to 1 if no model exists
    version_list = []
    if not model_version and hdfs.exists(model_dir_hdfs):
        model_version_directories = hdfs.ls(model_dir_hdfs)
        for version_dir in model_version_directories:
            try:
                if hdfs.isdir(version_dir):
                    version_list.append(int(version_dir[len(model_dir_hdfs):]))
            except:
                pass
        if len(version_list) > 0:
            model_version = max(version_list) + 1

    if not model_version:
        model_version = 1

    # Path to directory in HDFS to put the model files
    model_version_dir_hdfs = model_dir_hdfs + str(model_version)

    # If version directory already exists and we are not overwriting it then fail
    if not overwrite and hdfs.exists(model_version_dir_hdfs):
        raise ValueError("Could not create model directory: {}, the path already exists, "
                         "set flag overwrite=True "
                         "to remove the version directory and create the correct directory structure".format(
                             model_version_dir_hdfs))

    # Overwrite version directory by deleting all content (this is needed for Provenance to register Model as deleted)
    if overwrite and hdfs.exists(model_version_dir_hdfs):
        hdfs.delete(model_version_dir_hdfs, recursive=True)
        hdfs.mkdir(model_version_dir_hdfs)

    # At this point we can create the version directory if it does not exist
    if not hdfs.exists(model_version_dir_hdfs):
        hdfs.mkdir(model_version_dir_hdfs)

    # Export the model files
    if os.path.exists(model_path):
        export_dir = _export_local_model(model_path, model_version_dir_hdfs, overwrite)
    else:
        export_dir = _export_hdfs_model(model_path, model_version_dir_hdfs, overwrite)

    print("Exported model " + model_name + " as version " + str(model_version) + " successfully.")

    jobName = None
    if constants.ENV_VARIABLES.JOB_NAME_ENV_VAR in os.environ:
        jobName = os.environ[constants.ENV_VARIABLES.JOB_NAME_ENV_VAR]

    kernelId = None
    if constants.ENV_VARIABLES.KERNEL_ID_ENV_VAR in os.environ:
        kernelId = os.environ[constants.ENV_VARIABLES.KERNEL_ID_ENV_VAR]

    # Attach modelName_modelVersion to experiment directory
    model_summary = {'name': model_name, 'version': model_version, 'metrics': metrics,
                     'experimentId': None, 'description': description, 'jobName': jobName, 'kernelId': kernelId}
    if 'ML_ID' in os.environ:
        # Attach link from experiment to model
        experiment_utils._attach_model_link_xattr(os.environ['ML_ID'], model_name + '_' + str(model_version))
        # Attach model metadata to models version folder
        model_summary['experimentId'] = os.environ['ML_ID']
        experiment_utils._attach_model_xattr(model_name + "_" + str(model_version),
                                             experiment_utils.dumps(model_summary))
    else:
        experiment_utils._attach_model_xattr(model_name + "_" + str(model_version),
                                             experiment_utils.dumps(model_summary))

    # Model metadata is attached asynchronously by Epipe, therefore this is necessary to ensure that following steps
    # in a pipeline will not fail
    if synchronous:
        start_time = time.time()
        sleep_seconds = 5
        for i in range(int(synchronous_timeout / sleep_seconds)):
            try:
                time.sleep(sleep_seconds)
                print("Polling " + model_name + " version " + str(model_version) + " for model availability.")
                resp = get_model(model_name, model_version)
                if resp.ok:
                    print("Model now available.")
                    return export_dir
                print(model_name + " not ready yet, retrying in " + str(sleep_seconds) + " seconds.")
            except ModelNotFound:
                pass
        print("Model not available during polling, set a higher value for synchronous_timeout to wait longer.")

    return export_dir
def export(model_path, model_name, model_version=1, overwrite=False):
    """
    Copies a trained model to the Models directory in the project and creates the directory structure of:

    >>> Models
    >>>      |
    >>>      - model_name
    >>>                 |
    >>>                 - version_x
    >>>                 |
    >>>                 - version_y

    For example if you run this:

    >>> serving.export("iris_knn.pkl", "irisFlowerClassifier", 1, overwrite=True)

    it will copy the local model file "iris_knn.pkl" to /Projects/projectname/Models/irisFlowerClassifier/1/iris_knn.pkl
    on HDFS, and overwrite in case there already exists a file with the same name in the directory.

    If you run:

    >>> serving.export("Resources/iris_knn.pkl", "irisFlowerClassifier", 1, overwrite=True)

    it will first check if the path Resources/iris_knn.pkl exists on your local filesystem in the current working
    directory. If the path was not found, it will check in your project's HDFS directory and if it finds the model
    there it will copy it to /Projects/projectname/Models/irisFlowerClassifier/1/iris_knn.pkl

    If "model" is a directory on the local path exported by TensorFlow, and you run:

    >>> serving.export("/model/", "mnist", 1, overwrite=True)

    It will copy the model directory contents to /Projects/projectname/Models/mnist/1/, e.g. the "model.pb" file and
    the "variables" directory.

    Args:
        :model_path: path to the trained model (HDFS or local)
        :model_name: name of the model/serving
        :model_version: version of the model/serving
        :overwrite: boolean flag whether to overwrite in case a serving already exists in the exported directory

    Returns:
        The path to where the model was exported

    Raises:
        :ValueError: if there was an error with the export of the model due to invalid user input
    """
    if not hdfs.exists(model_path) and not os.path.exists(model_path):
        raise ValueError("the provided model_path: {} , does not exist in HDFS or on the local filesystem".format(
            model_path))

    # Create directory in HDFS to put the model files
    project_path = hdfs.project_path()
    model_dir_hdfs = project_path + constants.MODEL_SERVING.MODELS_DATASET + \
                     constants.DELIMITERS.SLASH_DELIMITER + str(model_name) + \
                     constants.DELIMITERS.SLASH_DELIMITER + str(model_version) + \
                     constants.DELIMITERS.SLASH_DELIMITER
    if not hdfs.exists(model_dir_hdfs):
        hdfs.mkdir(model_dir_hdfs)

    if (not overwrite) and hdfs.exists(model_dir_hdfs) and hdfs.isfile(model_dir_hdfs):
        raise ValueError("Could not create model directory: {}, the path already exists and is a file, "
                         "set flag overwrite=True "
                         "to remove the file and create the correct directory structure".format(model_dir_hdfs))

    if overwrite and hdfs.exists(model_dir_hdfs) and hdfs.isfile(model_dir_hdfs):
        hdfs.delete(model_dir_hdfs)
        hdfs.mkdir(model_dir_hdfs)

    # Export the model files
    if os.path.exists(model_path):
        return _export_local_model(model_path, model_dir_hdfs, overwrite)
    else:
        return _export_hdfs_model(model_path, model_dir_hdfs, overwrite)
def lagom(
    map_fun,
    name="no-name",
    experiment_type="optimization",
    searchspace=None,
    optimizer=None,
    direction="max",
    num_trials=1,
    ablation_study=None,
    ablator=None,
    optimization_key="metric",
    hb_interval=1,
    es_policy="median",
    es_interval=300,
    es_min=10,
    description="",
):
    """Launches a maggy experiment, which depending on `experiment_type` can
    either be a hyperparameter optimization or an ablation study experiment.

    Given a search space, objective and a model training procedure `map_fun`
    (black-box function), an experiment is the whole process of finding the
    best hyperparameter combination in the search space, optimizing the
    black-box function. Currently maggy supports random search and a median
    stopping rule.

    **lagom** is a Swedish word meaning "just the right amount".

    :param map_fun: User defined experiment containing the model training.
    :type map_fun: function
    :param name: A user defined experiment identifier.
    :type name: str
    :param experiment_type: Type of Maggy experiment, either 'optimization'
        (default) or 'ablation'.
    :type experiment_type: str
    :param searchspace: A maggy Searchspace object from which samples are drawn.
    :type searchspace: Searchspace
    :param optimizer: The optimizer is the part generating new trials.
    :type optimizer: str, AbstractOptimizer
    :param direction: If set to 'max' the highest value returned will
        correspond to the best solution, if set to 'min' the opposite is true.
    :type direction: str
    :param num_trials: the number of trials to evaluate given the search
        space, each containing a different hyperparameter combination
    :type num_trials: int
    :param ablation_study: Ablation study object. Can be None for optimization
        experiment type.
    :type ablation_study: AblationStudy
    :param ablator: Ablator to use for experiment type 'ablation'.
    :type ablator: str, AbstractAblator
    :param optimization_key: Name of the metric to be optimized
    :type optimization_key: str, optional
    :param hb_interval: The heartbeat interval in seconds from trial executor
        to experiment driver, defaults to 1
    :type hb_interval: int, optional
    :param es_policy: The earlystopping policy, defaults to 'median'
    :type es_policy: str, optional
    :param es_interval: Frequency interval in seconds to check currently
        running trials for early stopping, defaults to 300
    :type es_interval: int, optional
    :param es_min: Minimum number of trials finalized before checking for
        early stopping, defaults to 10
    :type es_min: int, optional
    :param description: A longer description of the experiment.
    :type description: str, optional
    :raises RuntimeError: An experiment is currently running.
    :return: A dictionary indicating the best trial and best hyperparameter
        combination with its performance metric
    :rtype: dict
    """
    global running
    if running:
        raise RuntimeError("An experiment is currently running.")

    job_start = time.time()
    sc = hopsutil._find_spark().sparkContext
    exp_driver = None

    try:
        global app_id
        global experiment_json
        global run_id
        app_id = str(sc.applicationId)

        app_id, run_id = util._validate_ml_id(app_id, run_id)

        # start run
        running = True
        experiment_utils._set_ml_id(app_id, run_id)

        # create experiment dir
        hopshdfs.mkdir(experiment_utils._get_logdir(app_id, run_id))

        tensorboard._register(experiment_utils._get_logdir(app_id, run_id))

        num_executors = util.num_executors(sc)

        # start experiment driver
        if experiment_type == "optimization":

            assert num_trials > 0, "number of trials should be greater than zero"
            tensorboard._write_hparams_config(
                experiment_utils._get_logdir(app_id, run_id), searchspace
            )

            if num_executors > num_trials:
                num_executors = num_trials

            exp_driver = ExperimentDriver(
                "optimization",
                searchspace=searchspace,
                optimizer=optimizer,
                direction=direction,
                num_trials=num_trials,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                es_policy=es_policy,
                es_interval=es_interval,
                es_min=es_min,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )

            exp_function = exp_driver.optimizer.name()

        elif experiment_type == "ablation":
            exp_driver = ExperimentDriver(
                "ablation",
                ablation_study=ablation_study,
                ablator=ablator,
                name=name,
                num_executors=num_executors,
                hb_interval=hb_interval,
                description=description,
                log_dir=experiment_utils._get_logdir(app_id, run_id),
            )
            # using exp_driver.num_executors since it has been set using
            # ablator.get_number_of_trials() in experiment.py
            if num_executors > exp_driver.num_executors:
                num_executors = exp_driver.num_executors

            exp_function = exp_driver.ablator.name()
        else:
            running = False
            raise RuntimeError(
                "Unknown experiment_type: "
                "should be either 'optimization' or 'ablation', "
                "but it is '{0}'".format(str(experiment_type))
            )

        nodeRDD = sc.parallelize(range(num_executors), num_executors)

        # Do provenance after initializing exp_driver, because exp_driver does
        # the type checks for optimizer and searchspace
        sc.setJobGroup(os.environ["ML_ID"], "{0} | {1}".format(name, exp_function))

        experiment_json = experiment_utils._populate_experiment(
            name,
            exp_function,
            "MAGGY",
            exp_driver.searchspace.json(),
            description,
            app_id,
            direction,
            optimization_key,
        )

        experiment_json = experiment_utils._attach_experiment_xattr(
            app_id, run_id, experiment_json, "CREATE"
        )

        util._log(
            "Started Maggy Experiment: {0}, {1}, run {2}".format(name, app_id, run_id)
        )

        exp_driver.init(job_start)

        server_addr = exp_driver.server_addr

        # Force execution on executor, since GPU is located on executor
        nodeRDD.foreachPartition(
            trialexecutor._prepare_func(
                app_id,
                run_id,
                experiment_type,
                map_fun,
                server_addr,
                hb_interval,
                exp_driver._secret,
                optimization_key,
                experiment_utils._get_logdir(app_id, run_id),
            )
        )
        job_end = time.time()

        result = exp_driver.finalize(job_end)
        best_logdir = (
            experiment_utils._get_logdir(app_id, run_id) + "/" + result["best_id"]
        )

        util._finalize_experiment(
            experiment_json,
            float(result["best_val"]),
            app_id,
            run_id,
            "FINISHED",
            exp_driver.duration,
            experiment_utils._get_logdir(app_id, run_id),
            best_logdir,
            optimization_key,
        )

        util._log("Finished Experiment")

        return result
    except:  # noqa: E722
        _exception_handler(
            experiment_utils._seconds_to_milliseconds(time.time() - job_start)
        )
        if exp_driver:
            if exp_driver.exception:
                raise exp_driver.exception
        raise
    finally:
        # grace period to send last logs to sparkmagic
        # sparkmagic hb poll interval is 5 seconds, therefore wait 6 seconds
        time.sleep(6)
        # cleanup spark jobs
        if running and exp_driver is not None:
            exp_driver.stop()
        run_id += 1
        running = False
        sc.setJobGroup("", "")

    return result
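# A hedged usage sketch for lagom, kept as a comment since the docstring above has no
# example. The Searchspace bounds, the optimizer name, and the body of train_fn are
# illustrative assumptions, not values prescribed by this module.
#
#   from maggy import experiment, Searchspace
#
#   sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]))
#
#   def train_fn(kernel, pool, reporter):
#       # build and train a model with the suggested hyperparameters,
#       # then return the metric named by optimization_key
#       ...
#       return accuracy
#
#   result = experiment.lagom(train_fn, name='demo', searchspace=sp,
#                             optimizer='randomsearch', direction='max',
#                             num_trials=15, hb_interval=5)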
def _wrapper_fun(iter): """ Args: iter: Returns: """ for i in iter: executor_num = i experiment_utils._set_ml_id(app_id, run_id) t = threading.Thread(target=devices._print_periodic_gpu_utilization) if devices.get_num_gpus() > 0: t.start() role = None logdir = None tb_hdfs_path = None client = parameter_server_reservation.Client(server_addr) try: host = experiment_utils._get_ip_address() tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) tmp_socket.bind(('', 0)) port = tmp_socket.getsockname()[1] host_port = host + ":" + str(port) exec_spec = {} if executor_num < num_ps: exec_spec["task_type"] = "ps" else: exec_spec["task_type"] = "worker" exec_spec["host_port"] = host_port exec_spec["gpus_present"] = devices.get_num_gpus() > 0 client.register(exec_spec) cluster = client.await_reservations() tmp_socket.close() role, index = experiment_utils._find_task_and_index(host_port, cluster) cluster_spec = {} cluster_spec["cluster"] = cluster cluster_spec["task"] = {"type": role, "index": index} evaluator_node = None if evaluator: last_worker_index = len(cluster_spec["cluster"]["worker"])-1 evaluator_node = cluster_spec["cluster"]["worker"][last_worker_index] cluster_spec["cluster"]["evaluator"] = [evaluator_node] del cluster_spec["cluster"]["worker"][last_worker_index] if evaluator_node == host_port: role = "evaluator" cluster_spec["task"] = {"type": "evaluator", "index": 0} print('TF_CONFIG: {} '.format(cluster_spec)) os.environ["TF_CONFIG"] = json.dumps(cluster_spec) logfile = experiment_utils._init_logger(experiment_utils._get_logdir(app_id, run_id), role=role, index=cluster_spec["task"]["index"]) dist_logdir = experiment_utils._get_logdir(app_id, run_id) + '/logdir' is_chief = (cluster["task"]["type"] == "chief") if is_chief: hdfs.mkdir(dist_logdir) tensorboard._register(dist_logdir, experiment_utils._get_logdir(app_id, run_id), executor_num, local_logdir=local_logdir) else: tensorboard.events_logdir = dist_logdir print(devices._get_gpu_info()) print('-------------------------------------------------------') print('Started running task') task_start = time.time() retval=None if role == "ps": ps_thread = threading.Thread(target=lambda: map_fun()) ps_thread.start() client.await_all_workers_finished() else: retval = map_fun() if role == "chief": experiment_utils._handle_return_simple(retval, experiment_utils._get_logdir(app_id, run_id), logfile) task_end = time.time() time_str = 'Finished task - took ' + experiment_utils._time_diff(task_start, task_end) print(time_str) print('-------------------------------------------------------') except: raise finally: if role != "ps": client.register_worker_finished() client.close() experiment_utils._cleanup(tensorboard, t)