Example #1
def export(local_model_path, model_name, model_version):

    project_path = hdfs.project_path()

    # Create directory with model name
    hdfs_handle = hdfs.get()
    model_name_root_directory = project_path + '/Models/' + str(model_name) + '/' + str(model_version) + '/'
    hdfs_handle.create_directory(model_name_root_directory)

    for (path, dirs, files) in os.walk(local_model_path):

        hdfs_export_subpath = path.replace(local_model_path, '')

        current_hdfs_dir = model_name_root_directory + '/' + hdfs_export_subpath

        if not hdfs_handle.exists(current_hdfs_dir):
            hdfs_handle.create_directory(current_hdfs_dir)

        for f in files:
            if not hdfs_handle.exists(current_hdfs_dir + '/' + f):
                pydoop.hdfs.put(path + '/' + f, current_hdfs_dir)

        for d in dirs:
            if not hdfs_handle.exists(current_hdfs_dir + '/' + d):
                pydoop.hdfs.put(path + '/' + d, current_hdfs_dir + '/')
        break
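
A minimal usage sketch for the export helper above, assuming the module-level imports it relies on (os, pydoop.hdfs and the hops hdfs module); the local path, model name and version below are made-up placeholders:

# Hypothetical call: copy a locally saved model into Models/<model_name>/<model_version>/ in HopsFS
local_model_path = '/tmp/exported_model'        # placeholder local directory containing the saved model
export(local_model_path, 'mnist_classifier', 1)
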
Example #2
def _version_resources(versioned_resources, rundir):
    """

    Args:
        versioned_resources:
        rundir:

    Returns:

    """
    if not versioned_resources:
        return None
    pyhdfs_handle = hdfs.get()
    pyhdfs_handle.create_directory(rundir)
    endpoint_prefix = hdfs.project_path()
    versioned_paths = []
    for hdfs_resource in versioned_resources:
        if pydoop.hdfs.path.exists(hdfs_resource):
            log("Versoning resource '%s' in rundir '%s'" %
                (hdfs_resource, rundir))

            # Remove the file if it exists
            target_path = os.path.join(rundir, os.path.basename(hdfs_resource))
            if hdfs.exists(target_path):
                hdfs.rmr(target_path)

            hdfs.cp(hdfs_resource, rundir)
            path, filename = os.path.split(hdfs_resource)
            versioned_paths.append(
                rundir.replace(endpoint_prefix, '') + '/' + filename)
        else:
            log("Resource not found '%s'" % hdfs_resource, level='warning')
            #raise Exception('Could not find resource in specified path: ' + hdfs_resource)

    return ', '.join(versioned_paths)
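
A hedged sketch of how _version_resources might be called; the resource paths and run directory are placeholders, with hdfs.project_path() used the same way as inside the function:

# Hypothetical call: copy two project files into a run directory and get back their project-relative paths
rundir = hdfs.project_path() + 'Experiments/application_1234_0001_1'     # placeholder run directory
resources = [hdfs.project_path() + 'Resources/train.csv',                # placeholder resource paths
             hdfs.project_path() + 'Resources/schema.json']
print(_version_resources(resources, rundir))
# e.g. 'Experiments/application_1234_0001_1/train.csv, Experiments/application_1234_0001_1/schema.json'
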
Example #3
def _version_resources(versioned_resources, rundir):
    """

    Args:
        versioned_resources:
        rundir:

    Returns:

    """
    if not versioned_resources:
        return None
    pyhdfs_handle = hdfs.get()
    pyhdfs_handle.create_directory(rundir)
    endpoint_prefix = hdfs.project_path()
    versioned_paths = []
    for hdfs_resource in versioned_resources:
        if pydoop.hdfs.path.exists(hdfs_resource):
            pyhdfs_handle.copy(hdfs_resource, pyhdfs_handle, rundir)
            path, filename = os.path.split(hdfs_resource)
            versioned_paths.append(
                rundir.replace(endpoint_prefix, '') + '/' + filename)
        else:
            raise Exception('Could not find resource in specified path: ' +
                            hdfs_resource)

    return ', '.join(versioned_paths)
Example #4
def visualize(hdfs_root_logdir):
    """ Visualize all TensorBoard events for a given path in HopsFS. This is intended for use after running TensorFlow jobs to visualize
    them all in the same TensorBoard. tflauncher.launch returns the path in HopsFS which should be handed as argument for this method to visualize all runs.

    Args:
      :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """

    sc = util._find_spark().sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    # find a free port for TensorBoard
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    tb_addr, tb_port = tb_socket.getsockname()

    tb_path = util._find_tensorboard()

    tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''
    tb_env['LC_ALL'] = 'C'

    tb_proc = subprocess.Popen([
        pypath, tb_path,
        "--logdir=%s" % logdir,
        "--port=%d" % tb_port,
        "--host=%s" % "0.0.0.0"
    ],
                               env=tb_env,
                               preexec_fn=util._on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs._get_experiments_dir() + "/" + app_id + "/TensorBoard.visualize"
    # dump the TensorBoard host:port endpoint to HDFS
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if extension != '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    tb_proc.wait()
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
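
A hedged usage sketch for visualize, assuming the hops hdfs module is available as hopshdfs as in the snippet; the logdir below is illustrative rather than taken from the original:

# Hypothetical call: start a TensorBoard over all event files under an experiment directory in HopsFS
hdfs_root_logdir = hopshdfs.project_path() + 'Experiments/application_1234_0001'  # illustrative path
visualize(hdfs_root_logdir)  # blocks until the TensorBoard subprocess exits
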
def cleanup(tb_hdfs_path):
    hopshdfs.log('Performing cleanup')
    handle = hopshdfs.get()
    if tb_hdfs_path and handle.exists(tb_hdfs_path):
        handle.delete(tb_hdfs_path)
    hopshdfs.kill_logger()
Example #6
def _create_experiment_subdirectories(app_id,
                                      run_id,
                                      param_string,
                                      type,
                                      sub_type=None,
                                      params=None):
    """
    Creates directories for an experiment, if Experiments folder exists it will create directories
    below it, otherwise it will create them in the Logs directory.

    Args:
        :app_id: YARN application ID of the experiment
        :run_id: Experiment ID
        :param_string: name of the new directory created under parent directories
        :type: type of the new directory parent, e.g. differential_evolution
        :sub_type: type of the sub directory under the parent, e.g. generation
        :params: dict of hyperparameters

    Returns:
        The new directories for the yarn-application and for the execution (hdfs_exec_logdir, hdfs_appid_logdir)
    """

    pyhdfs_handle = hdfs.get()

    hdfs_events_parent_dir = hdfs.project_path() + "Experiments"

    hdfs_experiment_dir = hdfs_events_parent_dir + "/" + app_id + "_" + str(
        run_id)

    # determine directory structure based on arguments
    if sub_type:
        hdfs_exec_logdir = hdfs_experiment_dir + "/" + str(
            sub_type) + '/' + str(param_string)
        if pyhdfs_handle.exists(hdfs_exec_logdir):
            hdfs.delete(hdfs_exec_logdir, recursive=True)
    elif not param_string and not sub_type:
        if pyhdfs_handle.exists(hdfs_experiment_dir):
            hdfs.delete(hdfs_experiment_dir, recursive=True)
        hdfs_exec_logdir = hdfs_experiment_dir + '/'
    else:
        hdfs_exec_logdir = hdfs_experiment_dir + '/' + str(param_string)
        if pyhdfs_handle.exists(hdfs_exec_logdir):
            hdfs.delete(hdfs_exec_logdir, recursive=True)

    # any existing directory has been removed above (it may be left over from a task retry)

    # create the new directory
    pyhdfs_handle.create_directory(hdfs_exec_logdir)

    return_file = hdfs_exec_logdir + '/.hparams.json'
    hdfs.dump(dumps(params), return_file)

    return hdfs_exec_logdir, hdfs_experiment_dir
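
A hedged sketch of how _create_experiment_subdirectories could be invoked; the application ID, run ID, sub-type and hyperparameters are placeholders:

# Hypothetical call: creates Experiments/<app_id>_<run_id>/<sub_type>/<param_string>/ and dumps .hparams.json
exec_logdir, experiment_dir = _create_experiment_subdirectories(
    'application_1234_0001',              # placeholder YARN application ID
    1,                                    # placeholder run ID
    'learning_rate=0.01&dropout=0.5',     # placeholder parameter string
    'differential_evolution',             # type, as in the docstring example
    sub_type='generation.0',              # placeholder sub-type
    params={'learning_rate': 0.01, 'dropout': 0.5})
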
def _get_metric(param_string, app_id, generation_id, run_id):
    project_path = hopshdfs.project_path()
    handle = hopshdfs.get()
    for i in range(generation_id):
        possible_result_path = hopshdfs.get_experiments_dir() + '/' + app_id + '/differential_evolution/run.' \
                               + str(run_id) + '/generation.' + str(i) + '/' + param_string + '/metric'
        if handle.exists(possible_result_path):
            with pydoop.hdfs.open(possible_result_path, "r") as fi:
                metric = float(fi.read())
                return metric

    return None
def _cleanup(tb_hdfs_path):
    """

    Args:
        tb_hdfs_path:

    Returns:

    """
    handle = hopshdfs.get()
    if tb_hdfs_path and handle.exists(tb_hdfs_path):
        handle.delete(tb_hdfs_path)
    hopshdfs._kill_logger()
def _cleanup(tensorboard, gpu_thread):

    print("Cleaning up... ")

    # Kill running TB
    try:
        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])
    except Exception as err:
        print('Exception occurred while killing tensorboard: {}'.format(err))
        pass

    # Store local TB in hdfs
    try:
        if tensorboard.local_logdir_bool and tensorboard.events_logdir:
            _store_local_tensorboard(tensorboard.local_logdir_path,
                                     tensorboard.events_logdir)
    except Exception as err:
        print('Exception occurred while uploading local logdir to hdfs: {}'.
              format(err))
        pass

    # Get rid of TensorBoard endpoint file
    try:
        handle = hdfs.get()
        if tensorboard.endpoint and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)
    except Exception as err:
        print(
            'Exception occurred while deleting tensorboard endpoint file: {}'.
            format(err))
        pass
    finally:
        tensorboard._reset_global()

    # Close the logging fd and flush
    try:
        _close_logger()
    except Exception as err:
        print('Exception occurred while closing logger: {}'.format(err))
        pass

    # Stop the gpu monitoring thread
    try:
        gpu_thread.do_run = False
    except Exception as err:
        print('Exception occurred while stopping GPU monitoring thread: {}'.
              format(err))
        pass
def end(metric=None):
    """
    End a custom Experiment previously registered with *begin* and register a metric to associate with it.

    Args:
        :metric: The metric to associate with the Experiment

    """
    global running
    global experiment_json
    global elastic_id
    global driver_tensorboard_hdfs_path
    global app_id
    if not running:
        raise RuntimeError(
            "An experiment is not running. Did you forget to call experiment.begin()?"
        )
    try:
        if metric:
            experiment_json = util._finalize_experiment(
                experiment_json, None, str(metric))
            util._put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                              experiment_json)
        else:
            experiment_json = util._finalize_experiment(
                experiment_json, None, None)
            util._put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                              experiment_json)
    except:
        _exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False
        handle = hopshdfs.get()

        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])

        if tensorboard.local_logdir_bool:
            local_tb = tensorboard.local_logdir_path
            util._store_local_tensorboard(local_tb, tensorboard.events_logdir)

        if tensorboard.endpoint and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)
        hopshdfs._kill_logger()
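
An illustrative sketch of the begin/end pattern the docstring describes; experiment.begin() and train_model() are assumptions for illustration, not part of the snippet above:

experiment.begin()              # assumed registration call referenced by the docstring
metric = train_model()          # placeholder for user training code that returns a metric
experiment.end(metric=metric)
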
Example #11
def _get_return_file(param_string, app_id, generation_id, run_id):
    """

    Args:
        :param_string:
        :app_id:
        :generation_id:
        :run_id:

    Returns:

    """
    handle = hdfs.get()
    for i in range(generation_id):
        possible_result_path = experiment_utils._get_experiments_dir() + '/' + app_id + '_' \
                               + str(run_id) + '/generation.' + str(i) + '/' + param_string + '/.outputs.json'
        if handle.exists(possible_result_path):
            return_file_contents = hdfs.load(possible_result_path)
            return return_file_contents

    return None
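
A hedged sketch of a lookup with _get_return_file; all of the identifiers below are placeholders:

# Hypothetical call: search generations 0..4 of run 1 for an .outputs.json written for this parameter combination
outputs_json = _get_return_file('learning_rate=0.01&dropout=0.5',
                                'application_1234_0001', 5, 1)
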
Example #12
def end(metric=None):
    global running
    global experiment_json
    global elastic_id
    global driver_tensorboard_hdfs_path
    global app_id
    if not running:
        raise RuntimeError(
            "An experiment is not running. Did you forget to call experiment.begin()?"
        )
    try:
        if metric:
            experiment_json = util.finalize_experiment(experiment_json, None,
                                                       str(metric))
            util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                             experiment_json)
        else:
            experiment_json = util.finalize_experiment(experiment_json, None,
                                                       None)
            util.put_elastic(hopshdfs.project_name(), app_id, elastic_id,
                             experiment_json)
    except:
        exception_handler()
        raise
    finally:
        elastic_id += 1
        running = False
        handle = hopshdfs.get()

        if tensorboard.tb_pid != 0:
            subprocess.Popen(["kill", str(tensorboard.tb_pid)])

        if tensorboard.local_logdir_bool:
            local_tb = tensorboard.local_logdir_path
            util.store_local_tensorboard(local_tb, tensorboard.events_logdir)

        if tensorboard.endpoint and handle.exists(tensorboard.endpoint):
            handle.delete(tensorboard.endpoint)
        hopshdfs.kill_logger()
def _cleanup(tb_hdfs_path):
    handle = hopshdfs.get()
    if tb_hdfs_path and handle.exists(tb_hdfs_path):
        handle.delete(tb_hdfs_path)
    hopshdfs.kill_logger()