def export(local_model_path, model_name, model_version):
    project_path = hdfs.project_path()

    # Create directory with model name
    hdfs_handle = hdfs.get()
    model_name_root_directory = project_path + '/Models/' + str(model_name) + '/' + str(model_version) + '/'
    hdfs_handle.create_directory(model_name_root_directory)

    for (path, dirs, files) in os.walk(local_model_path):

        hdfs_export_subpath = path.replace(local_model_path, '')
        current_hdfs_dir = model_name_root_directory + '/' + hdfs_export_subpath

        if not hdfs_handle.exists(current_hdfs_dir):
            hdfs_handle.create_directory(current_hdfs_dir)

        for f in files:
            if not hdfs_handle.exists(current_hdfs_dir + '/' + f):
                pydoop.hdfs.put(path + '/' + f, current_hdfs_dir)

        for d in dirs:
            if not hdfs_handle.exists(current_hdfs_dir + '/' + d):
                pydoop.hdfs.put(path + '/' + d, current_hdfs_dir + '/')

        # Only walk the top level; subdirectories are uploaded recursively by put() above
        break
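# Illustrative usage sketch for export() above, not part of the original listing.
# It assumes the hops-style environment these snippets run in (an `hdfs` module with
# project_path()/get() and pydoop on the path) and that a trained model has already been
# saved to a local directory; the model name and version below are made-up examples.

def _example_export_usage():
    import os
    local_model_dir = os.path.join(os.getcwd(), 'saved_model')  # local directory holding the trained model
    # Copies the local directory tree to <project>/Models/mnist/1/ in HopsFS
    export(local_model_dir, 'mnist', 1)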
def _version_resources(versioned_resources, rundir):
    """
    Versions (copies) a list of HDFS resources into the run directory.

    Args:
        versioned_resources: list of HDFS paths to version
        rundir: the run directory to copy the resources into

    Returns:
        A comma-separated string of the versioned resource paths, or None
    """
    if not versioned_resources:
        return None
    pyhdfs_handle = hdfs.get()
    pyhdfs_handle.create_directory(rundir)
    endpoint_prefix = hdfs.project_path()
    versioned_paths = []
    for hdfs_resource in versioned_resources:
        if pydoop.hdfs.path.exists(hdfs_resource):
            log("Versioning resource '%s' in rundir '%s'" % (hdfs_resource, rundir))
            # Remove the file if it already exists in the run directory
            target_path = os.path.join(rundir, os.path.basename(hdfs_resource))
            if hdfs.exists(target_path):
                hdfs.rmr(target_path)
            hdfs.cp(hdfs_resource, rundir)
            path, filename = os.path.split(hdfs_resource)
            versioned_paths.append(rundir.replace(endpoint_prefix, '') + '/' + filename)
        else:
            log("Resource not found '%s'" % hdfs_resource, level='warning')
            #raise Exception('Could not find resource in specified path: ' + hdfs_resource)

    return ', '.join(versioned_paths)
def _version_resources(versioned_resources, rundir):
    """
    Versions (copies) a list of HDFS resources into the run directory.

    Args:
        versioned_resources: list of HDFS paths to version
        rundir: the run directory to copy the resources into

    Returns:
        A comma-separated string of the versioned resource paths, or None
    """
    if not versioned_resources:
        return None
    pyhdfs_handle = hdfs.get()
    pyhdfs_handle.create_directory(rundir)
    endpoint_prefix = hdfs.project_path()
    versioned_paths = []
    for hdfs_resource in versioned_resources:
        if pydoop.hdfs.path.exists(hdfs_resource):
            pyhdfs_handle.copy(hdfs_resource, pyhdfs_handle, rundir)
            path, filename = os.path.split(hdfs_resource)
            versioned_paths.append(rundir.replace(endpoint_prefix, '') + '/' + filename)
        else:
            raise Exception('Could not find resource in specified path: ' + hdfs_resource)

    return ', '.join(versioned_paths)
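# Illustrative sketch of how _version_resources() above is typically driven, not part of
# the original listing: the caller passes a list of HDFS paths plus a run directory and
# gets back a comma-separated string of project-relative paths. The resource paths and
# run directory below are made-up examples.

def _example_version_resources_usage():
    rundir = hdfs.project_path() + 'Experiments/application_0000_0001_1'
    resources = [hdfs.project_path() + 'Resources/train.py',
                 hdfs.project_path() + 'Resources/config.json']
    versioned = _version_resources(resources, rundir)
    print(versioned)  # e.g. "Experiments/application_0000_0001_1/train.py, ..."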
def visualize(hdfs_root_logdir):
    """ Visualize all TensorBoard events for a given path in HopsFS. This is intended for use after running TensorFlow jobs
    to visualize them all in the same TensorBoard. tflauncher.launch returns the path in HopsFS which should be handed as
    an argument to this method to visualize all runs.

    Args:
      :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """

    sc = util._find_spark().sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    # Find a free port for TensorBoard
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    tb_addr, tb_port = tb_socket.getsockname()
    tb_path = util._find_tensorboard()
    tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''
    tb_env['LC_ALL'] = 'C'

    tb_proc = subprocess.Popen(
        [pypath, tb_path, "--logdir=%s" % logdir, "--port=%d" % tb_port, "--host=%s" % "0.0.0.0"],
        env=tb_env,
        preexec_fn=util._on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs._get_experiments_dir() + "/" + app_id + "/TensorBoard.visualize"
    # Dump the TensorBoard host:port to HDFS
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if not extension == '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    tb_proc.wait()
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
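# Illustrative usage sketch for visualize() above, not part of the original listing.
# It assumes a Spark driver running inside a Hops job and that a previous launcher call
# (e.g. tflauncher.launch, as mentioned in the docstring) produced the HDFS logdir;
# the path below is a made-up example.

def _example_visualize_usage():
    hdfs_root_logdir = hopshdfs.project_path() + 'Experiments/application_0000_0001_1'
    # Starts a TensorBoard on the driver over all event files under the logdir
    # and blocks until the TensorBoard process terminates
    visualize(hdfs_root_logdir)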
def cleanup(tb_hdfs_path):
    hopshdfs.log('Performing cleanup')
    handle = hopshdfs.get()
    if tb_hdfs_path is not None and tb_hdfs_path != '' and handle.exists(tb_hdfs_path):
        handle.delete(tb_hdfs_path)
    hopshdfs.kill_logger()
def _create_experiment_subdirectories(app_id, run_id, param_string, type, sub_type=None, params=None):
    """
    Creates directories for an experiment. If the Experiments folder exists it will create directories
    below it, otherwise it will create them in the Logs directory.

    Args:
        :app_id: YARN application ID of the experiment
        :run_id: Experiment ID
        :param_string: name of the new directory created under parent directories
        :type: type of the new directory parent, e.g differential_evolution
        :sub_type: type of sub directory to parent, e.g generation
        :params: dict of hyperparameters

    Returns:
        The new directories for the yarn-application and for the execution (hdfs_exec_logdir, hdfs_appid_logdir)
    """
    pyhdfs_handle = hdfs.get()

    hdfs_events_parent_dir = hdfs.project_path() + "Experiments"
    hdfs_experiment_dir = hdfs_events_parent_dir + "/" + app_id + "_" + str(run_id)

    # Determine directory structure based on arguments
    if sub_type:
        hdfs_exec_logdir = hdfs_experiment_dir + "/" + str(sub_type) + '/' + str(param_string)
        if pyhdfs_handle.exists(hdfs_exec_logdir):
            hdfs.delete(hdfs_exec_logdir, recursive=True)
    elif not param_string and not sub_type:
        if pyhdfs_handle.exists(hdfs_experiment_dir):
            hdfs.delete(hdfs_experiment_dir, recursive=True)
        hdfs_exec_logdir = hdfs_experiment_dir + '/'
    else:
        hdfs_exec_logdir = hdfs_experiment_dir + '/' + str(param_string)
        # Need to remove the directory if it exists (might be a task retry)
        if pyhdfs_handle.exists(hdfs_exec_logdir):
            hdfs.delete(hdfs_exec_logdir, recursive=True)

    # Create the new directory
    pyhdfs_handle.create_directory(hdfs_exec_logdir)

    return_file = hdfs_exec_logdir + '/.hparams.json'
    hdfs.dump(dumps(params), return_file)

    return hdfs_exec_logdir, hdfs_experiment_dir
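# Illustrative sketch of the directory layout produced by _create_experiment_subdirectories()
# above, not part of the original listing; the application ID, run ID, parameter string and
# hyperparameters below are made-up examples.

def _example_create_experiment_subdirectories_usage():
    exec_logdir, experiment_dir = _create_experiment_subdirectories(
        app_id='application_0000_0001',
        run_id=1,
        param_string='learning_rate=0.01&dropout=0.5',
        type='differential_evolution',
        sub_type='generation.0',
        params={'learning_rate': 0.01, 'dropout': 0.5})
    # exec_logdir    -> <project>/Experiments/application_0000_0001_1/generation.0/learning_rate=0.01&dropout=0.5
    # experiment_dir -> <project>/Experiments/application_0000_0001_1
    # exec_logdir also contains a .hparams.json file with the params dict serialized as JSON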
def _get_metric(param_string, app_id, generation_id, run_id):
    project_path = hopshdfs.project_path()
    handle = hopshdfs.get()
    for i in range(generation_id):
        possible_result_path = hopshdfs.get_experiments_dir() + '/' + app_id + '/differential_evolution/run.' \
                               + str(run_id) + '/generation.' + str(i) + '/' + param_string + '/metric'
        if handle.exists(possible_result_path):
            with pydoop.hdfs.open(possible_result_path, "r") as fi:
                metric = float(fi.read())
                fi.close()
            return metric

    return None
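# Illustrative usage sketch for _get_metric() above, not part of the original listing:
# it scans earlier generations of a differential-evolution run for an already-computed
# metric, so a caller can skip re-evaluating a repeated hyperparameter combination.
# The IDs below are made-up examples.

def _example_get_metric_usage():
    metric = _get_metric('learning_rate=0.01&dropout=0.5', 'application_0000_0001',
                         generation_id=3, run_id=1)
    if metric is not None:
        print('Reusing previously computed metric: %f' % metric)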
def _cleanup(tb_hdfs_path):
    """
    Args:
        tb_hdfs_path: path of the TensorBoard endpoint file in HDFS

    Returns:
        None
    """
    handle = hopshdfs.get()
    if tb_hdfs_path is not None and tb_hdfs_path != '' and handle.exists(tb_hdfs_path):
        handle.delete(tb_hdfs_path)
    hopshdfs._kill_logger()
def _cleanup(tensorboard, gpu_thread):
    print("Cleaning up... ")

    # Kill the running TensorBoard process
    try:
        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])
    except Exception as err:
        print('Exception occurred while killing tensorboard: {}'.format(err))
        pass

    # Store the local TensorBoard logdir in HDFS
    try:
        if tensorboard.local_logdir_bool and tensorboard.events_logdir:
            _store_local_tensorboard(tensorboard.local_logdir_path, tensorboard.events_logdir)
    except Exception as err:
        print('Exception occurred while uploading local logdir to hdfs: {}'.format(err))
        pass

    # Get rid of the TensorBoard endpoint file
    try:
        handle = hdfs.get()
        if tensorboard.endpoint and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)
    except Exception as err:
        print('Exception occurred while deleting tensorboard endpoint file: {}'.format(err))
        pass
    finally:
        tensorboard._reset_global()

    # Close the logging fd and flush
    try:
        _close_logger()
    except Exception as err:
        print('Exception occurred while closing logger: {}'.format(err))
        pass

    # Stop the GPU monitoring thread
    try:
        gpu_thread.do_run = False
    except Exception as err:
        print('Exception occurred while stopping GPU monitoring thread: {}'.format(err))
        pass
def end(metric=None):
    """
    End a custom Experiment previously registered with *begin* and register a metric to associate with it.

    Args:
        :metric: The metric to associate with the Experiment
    """
    global running
    global experiment_json
    global elastic_id
    global driver_tensorboard_hdfs_path
    global app_id

    if not running:
        raise RuntimeError("An experiment is not running. Did you forget to call experiment.begin()?")

    try:
        if metric:
            experiment_json = util._finalize_experiment(experiment_json, None, str(metric))
        else:
            experiment_json = util._finalize_experiment(experiment_json, None, None)
        util._put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False
        handle = hopshdfs.get()

        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])

        if tensorboard.local_logdir_bool:
            local_tb = tensorboard.local_logdir_path
            util._store_local_tensorboard(local_tb, tensorboard.events_logdir)

        if tensorboard.endpoint is not None and tensorboard.endpoint != '' \
                and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)

        hopshdfs._kill_logger()
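# Illustrative sketch of the begin/end pattern that end() above closes, not part of the
# original listing. It assumes a matching begin() (referenced in the docstring) exists in
# the same module; its exact signature is not shown here, so the call below is an
# assumption. The training step and metric are made-up placeholders.

def _example_experiment_end_usage():
    begin(name='my_custom_experiment')   # assumed counterpart; check begin() for actual parameters
    accuracy = 0.93                      # placeholder for a real training/evaluation result
    end(metric=accuracy)                 # finalizes the experiment and records the metric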
def _get_return_file(param_string, app_id, generation_id, run_id):
    """
    Args:
        :param_string: name of the parameter directory
        :app_id: YARN application ID of the experiment
        :generation_id: current generation number
        :run_id: Experiment ID

    Returns:
        The contents of the .outputs.json return file, or None if not found
    """
    handle = hdfs.get()
    for i in range(generation_id):
        possible_result_path = experiment_utils._get_experiments_dir() + '/' + app_id + '_' \
                               + str(run_id) + '/generation.' + str(i) + '/' + param_string + '/.outputs.json'
        if handle.exists(possible_result_path):
            return_file_contents = hdfs.load(possible_result_path)
            return return_file_contents
    return None
def end(metric=None):
    global running
    global experiment_json
    global elastic_id
    global driver_tensorboard_hdfs_path
    global app_id

    if not running:
        raise RuntimeError("An experiment is not running. Did you forget to call experiment.begin()?")

    try:
        if metric:
            experiment_json = util.finalize_experiment(experiment_json, None, str(metric))
        else:
            experiment_json = util.finalize_experiment(experiment_json, None, None)
        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)
    except:
        exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False
        handle = hopshdfs.get()

        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])

        if tensorboard.local_logdir_bool:
            local_tb = tensorboard.local_logdir_path
            util.store_local_tensorboard(local_tb, tensorboard.events_logdir)

        if tensorboard.endpoint is not None and tensorboard.endpoint != '' \
                and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)

        hopshdfs.kill_logger()
def _cleanup(tb_hdfs_path):
    handle = hopshdfs.get()
    if tb_hdfs_path is not None and tb_hdfs_path != '' and handle.exists(tb_hdfs_path):
        handle.delete(tb_hdfs_path)
    hopshdfs.kill_logger()