def _launch(sc, map_fun, args_dict=None, local_logdir=False, name="no-name"):
    """

    Args:
        sc:
        map_fun:
        args_dict:
        local_logdir:
        name:

    Returns:

    """
    global run_id

    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        current_len = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if current_len != len(arg_lists[i]):
                raise ValueError(
                    'Length of each function argument list must be equal')
        num_executions = current_len

    sc.setJobGroup("Launcher", "{} | Running experiment".format(name))
    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    if args_dict is None:
        path_to_metric = _get_logdir(app_id) + '/metric'
        if pydoop.hdfs.path.exists(path_to_metric):
            with pydoop.hdfs.open(path_to_metric, "r") as fi:
                metric = float(fi.read())
            return metric, hopshdfs._get_experiments_dir() + '/' + app_id + '/launcher/run.' + str(run_id)

    return None, hopshdfs._get_experiments_dir() + '/' + app_id + '/launcher/run.' + str(run_id)
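
# Usage sketch (illustrative only, not part of the source): how the launcher above
# might be invoked. The SparkSession, the training function and the hyperparameter
# names/values below are assumptions made for the example.
def _example_simple_launch(spark):
    def train(learning_rate, dropout):
        # user training code would run here (and may write a 'metric' file)
        pass

    # every argument list must have the same length: one entry per execution
    args_dict = {'learning_rate': [0.001, 0.01], 'dropout': [0.3, 0.5]}
    metric, hdfs_run_dir = _launch(spark.sparkContext, train, args_dict,
                                   name='example_experiment')
    # metric is None when hyperparameter lists are given; hdfs_run_dir is the
    # run directory in HopsFS
    return metric, hdfs_run_dir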
# Example #2
def visualize(hdfs_root_logdir):
    """ Visualize all TensorBoard events for a given path in HopsFS. This is intended for use after running TensorFlow jobs to visualize
    them all in the same TensorBoard. tflauncher.launch returns the path in HopsFS which should be handed as argument for this method to visualize all runs.

    Args:
      :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """

    sc = util._find_spark().sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    #find free port
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    tb_addr, tb_port = tb_socket.getsockname()

    tb_path = util._find_tensorboard()

    tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''
    tb_env['LC_ALL'] = 'C'

    tb_proc = subprocess.Popen([
        pypath, tb_path,
        "--logdir=%s" % logdir,
        "--port=%d" % tb_port,
        "--host=%s" % "0.0.0.0"
    ],
                               env=tb_env,
                               preexec_fn=util._on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs._get_experiments_dir(
    ) + "/" + app_id + "/TensorBoard.visualize"
    #dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if extension != '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    # Block until the TensorBoard process exits (communicate() also waits)
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
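
# Usage sketch (illustrative only, not part of the source): visualize() expects the
# HopsFS path that a launcher returned as its run directory. The wrapper function and
# variable names below are assumptions made for the example.
def _example_visualize(spark):
    def train():
        pass  # training code that writes TensorBoard event files

    # the second value returned by the launcher is the HopsFS run directory
    _, hdfs_run_dir = _launch(spark.sparkContext, train, name='example_experiment')
    visualize(hdfs_run_dir)  # serves one TensorBoard over all runs under that path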
def _evolutionary_launch(spark_session, map_fun, args_dict, name="no-name"):
    """ Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
        :spark_session: SparkSession object
        :map_fun: The TensorFlow function to run
        :args_dict: (optional) A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
    """

    sc = spark_session.sparkContext
    app_id = str(sc.applicationId)

    arg_lists = list(args_dict.values())
    num_executions = len(arg_lists[0])

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    global generation_id
    global run_id

    #Make SparkUI intuitive by grouping jobs
    sc.setJobGroup(
        "Differential Evolution ",
        "{} | Hyperparameter Optimization, generation: {}".format(
            name, generation_id))
    nodeRDD.foreachPartition(
        _prepare_func(app_id, generation_id, map_fun, args_dict, run_id))

    generation_id += 1

    return hopshdfs._get_experiments_dir() + '/' + app_id + "/"
# Example #4
def _get_logdir(app_id):
    """

    Args:
        app_id:

    Returns:

    """
    global run_id
    return hopshdfs._get_experiments_dir() + '/' + app_id + '/begin/run.' +  str(run_id)
def _get_logdir(app_id):
    """

    Args:
        app_id:

    Returns:

    """
    global run_id
    return hopshdfs._get_experiments_dir() + '/' + app_id + '/collective_all_reduce/run.' + str(run_id)
def _get_logdir(app_id):
    """

    Args:
        :app_id:

    Returns:

    """
    global run_id
    return hopshdfs._get_experiments_dir() + "/" + app_id + "/differential_evolution/run." + str(run_id)
# Example #7
def _launch(sc, map_fun, args_dict=None, local_logdir=False, name="no-name"):
    """

    Args:
        sc:
        map_fun:
        args_dict:
        local_logdir:
        name:

    Returns:

    """
    global run_id

    app_id = str(sc.applicationId)

    num_executions = 1
    sc.setJobGroup("MirroredStrategy",
                   "{} | Running on multiple devices".format(name))
    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    path_to_metric = _get_logdir(app_id) + '/metric'
    if pydoop.hdfs.path.exists(path_to_metric):
        with pydoop.hdfs.open(path_to_metric, "r") as fi:
            metric = float(fi.read())
        return metric, hopshdfs._get_experiments_dir() + '/' + app_id + '/mirrored/run.' + str(run_id)

    return None, hopshdfs._get_experiments_dir() + '/' + app_id + '/mirrored/run.' + str(run_id)
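
# Usage sketch (illustrative only, not part of the source): the MirroredStrategy
# launcher above always runs a single execution. The SparkSession and the training
# function below are assumptions made for the example.
def _example_mirrored_launch(spark):
    def train():
        # distribution-aware training code would run here
        pass

    metric, hdfs_run_dir = _launch(spark.sparkContext, train, name='mirrored_example')
    return metric, hdfs_run_dir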
def _get_metric(param_string, app_id, generation_id, run_id):
    """

    Args:
        :param_string:
        :app_id:
        :generation_id:
        :run_id:

    Returns:

    """
    project_path = hopshdfs.project_path()
    handle = hopshdfs.get()
    for i in range(generation_id):
        possible_result_path = hopshdfs._get_experiments_dir() + '/' + app_id + '/differential_evolution/run.' \
                               + str(run_id) + '/generation.' + str(i) + '/' + param_string + '/metric'
        if handle.exists(possible_result_path):
            with pydoop.hdfs.open(possible_result_path, "r") as fi:
                metric = float(fi.read())
            return metric

    return None
def _grid_launch(sc, map_fun, args_dict, direction='max', local_logdir=False, name="no-name"):
    """
    Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
        sc:
        map_fun:
        args_dict:
        direction:
        local_logdir:
        name:

    Returns:

    """
    global run_id
    app_id = str(sc.applicationId)
    num_executions = 1

    if direction not in ('max', 'min'):
        raise ValueError('Invalid direction ' + direction + ', must be max or min')

    arg_lists = list(args_dict.values())
    current_len = len(arg_lists[0])
    for i in range(len(arg_lists)):
        if current_len != len(arg_lists[i]):
            raise ValueError('Length of each function argument list must be equal')
    num_executions = current_len

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Make SparkUI intuitive by grouping jobs
    sc.setJobGroup("Grid Search", "{} | Hyperparameter Optimization".format(name))

    #Force execution on executor, since GPU is located on executor
    job_start = datetime.datetime.now()
    nodeRDD.foreachPartition(_prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))
    job_end = datetime.datetime.now()

    job_time_str = util._time_diff(job_start, job_end)

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    hdfs_appid_dir = hopshdfs._get_experiments_dir() + '/' + app_id
    hdfs_runid_dir = _get_logdir(app_id)

    max_val, max_hp, min_val, min_hp, avg = _get_best(args_dict, num_executions, arg_names, arg_count, hdfs_appid_dir, run_id)

    param_combination = ""
    best_val = ""

    if direction == 'max':
        param_combination = max_hp
        best_val = str(max_val)
        results = '\n------ Grid Search results ------ direction(' + direction + ') \n' \
          'BEST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
          'WORST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
          'AVERAGE metric -- ' + str(avg) + '\n' \
          'Total job time ' + job_time_str + '\n'
        _write_result(hdfs_runid_dir, results)
        print(results)
    elif direction == 'min':
        param_combination = min_hp
        best_val = str(min_val)
        results = '\n------ Grid Search results ------ direction(' + direction + ') \n' \
        'BEST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
        'WORST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
        'AVERAGE metric -- ' + str(avg) + '\n' \
        'Total job time ' + job_time_str + '\n'
        _write_result(hdfs_runid_dir, results)
        print(results)


    print('Finished Experiment \n')

    return hdfs_runid_dir, param_combination, best_val
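
# Usage sketch (illustrative only, not part of the source): grid search expects the
# value lists to all have the same length. The SparkSession, training function and
# hyperparameter names/values below are assumptions made for the example.
def _example_grid_launch(spark):
    def train(learning_rate, dropout):
        # user training code; the best combination is picked according to `direction`
        pass

    args_dict = {'learning_rate': [0.1, 0.01, 0.001], 'dropout': [0.2, 0.4, 0.6]}
    hdfs_run_dir, best_params, best_metric = _grid_launch(
        spark.sparkContext, train, args_dict, direction='min', name='grid_example')
    return best_params, best_metric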
def _search(spark,
            function,
            search_dict,
            direction='max',
            generations=10,
            popsize=10,
            mutation=0.5,
            crossover=0.7,
            cleanup_generations=False,
            local_logdir=False,
            name="no-name"):
    """

    Args:
        :spark:
        :function:
        :search_dict:
        :direction:
        :generations:
        :popsize:
        :mutation:
        :crossover:
        :cleanup_generations:
        :local_logdir:
        :name:

    Returns:

    """

    global run_id
    global local_logdir_bool
    local_logdir_bool = local_logdir

    global spark_session
    spark_session = spark

    global objective_function
    objective_function = function

    global cleanup
    cleanup = cleanup_generations

    argcount = six.get_function_code(function).co_argcount
    arg_names = six.get_function_code(function).co_varnames

    ordered_arr = []

    app_id = spark.sparkContext.applicationId

    arg_lists = list(search_dict.values())
    for i in range(len(arg_lists)):
        if len(arg_lists[i]) != 2:
            raise ValueError(
                'Boundary list must contain exactly two elements, [lower_bound, upper_bound] for float/int '
                'or [category1, category2] in the case of strings')

    argIndex = 0
    while argcount != 0:
        ordered_arr.append(
            (arg_names[argIndex], search_dict[arg_names[argIndex]]))
        argcount = argcount - 1
        argIndex = argIndex + 1

    ordered_dict = OrderedDict(ordered_arr)

    bounds_list = []
    types_list = []

    for entry in ordered_dict:
        bounds_list.append((ordered_dict[entry][0], ordered_dict[entry][1]))

        if isinstance(ordered_dict[entry][0], int):
            types_list.append('int')
        elif isinstance(ordered_dict[entry][0], float):
            types_list.append('float')
        else:
            types_list.append('cat')

    global diff_evo
    diff_evo = DifferentialEvolution(_execute_all,
                                     bounds_list,
                                     types_list,
                                     ordered_dict,
                                     direction=direction,
                                     generations=generations,
                                     popsize=popsize,
                                     crossover=crossover,
                                     mutation=mutation,
                                     name=name)

    root_dir = hopshdfs._get_experiments_dir() + "/" + str(
        app_id) + "/differential_evolution/run." + str(run_id)

    best_param, best_metric = diff_evo._solve(root_dir)

    print('Finished Experiment \n')

    return str(root_dir), best_param, best_metric
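
# Usage sketch (illustrative only, not part of the source): each search_dict entry is a
# two-element boundary list, and its keys must match the argument names of the function
# being optimized. The SparkSession, function and values below are assumptions.
def _example_diff_evo_search(spark):
    def train(learning_rate, num_layers):
        # user training code; should produce the metric being optimized
        pass

    search_dict = {'learning_rate': [0.0001, 0.1], 'num_layers': [1, 4]}
    root_dir, best_params, best_metric = _search(
        spark, train, search_dict, direction='min', generations=5, popsize=10)
    return best_params, best_metric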
# Example #11
def _launch(sc,
            map_fun,
            args_dict,
            samples,
            direction='max',
            local_logdir=False,
            name="no-name"):
    """

    Args:
        sc:
        map_fun:
        args_dict:
        local_logdir:
        name:

    Returns:

    """
    global run_id

    app_id = str(sc.applicationId)

    arg_lists = list(args_dict.values())
    for i in range(len(arg_lists)):
        if len(arg_lists[i]) != 2:
            raise ValueError(
                'Boundary list must contain exactly two elements, [lower_bound, upper_bound] for each hyperparameter'
            )

    hp_names = args_dict.keys()

    random_dict = {}
    for hp in hp_names:
        lower_bound = args_dict[hp][0]
        upper_bound = args_dict[hp][1]

        assert lower_bound < upper_bound, "lower bound: " + str(
            lower_bound) + " must be less than upper bound: " + str(
                upper_bound)

        random_values = []

        if type(lower_bound) == int and type(upper_bound) == int:
            for i in range(samples):
                random_values.append(random.randint(lower_bound, upper_bound))
        elif type(lower_bound) == float and type(upper_bound) == float:
            for i in range(samples):
                random_values.append(random.uniform(lower_bound, upper_bound))
        else:
            raise ValueError('Only float and int are currently supported')

        random_dict[hp] = random_values

    random_dict, new_samples = _remove_duplicates(random_dict, samples)

    sc.setJobGroup("Random Search",
                   "{} | Hyperparameter Optimization".format(name))
    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(new_samples), new_samples)

    job_start = datetime.datetime.now()
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, random_dict, local_logdir))
    job_end = datetime.datetime.now()

    job_time_str = util._time_diff(job_start, job_end)

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    hdfs_appid_dir = hopshdfs._get_experiments_dir() + '/' + app_id
    hdfs_runid_dir = _get_logdir(app_id)

    max_val, max_hp, min_val, min_hp, avg = _get_best(random_dict, new_samples,
                                                      arg_names, arg_count,
                                                      hdfs_appid_dir, run_id)

    param_combination = ""
    best_val = ""

    if direction == 'max':
        param_combination = max_hp
        best_val = str(max_val)
        results = '\n------ Random Search results ------ direction(' + direction + ') \n' \
        'BEST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
        'WORST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
        'AVERAGE metric -- ' + str(avg) + '\n' \
        'Total job time ' + job_time_str + '\n'
        _write_result(hdfs_runid_dir, results)
        print(results)
    elif direction == 'min':
        param_combination = min_hp
        best_val = str(min_val)
        results = '\n------ Random Search results ------ direction(' + direction + ') \n' \
        'BEST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
        'WORST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
        'AVERAGE metric -- ' + str(avg) + '\n' \
        'Total job time ' + job_time_str + '\n'
        _write_result(hdfs_runid_dir, results)
        print(results)

    print('Finished Experiment \n')

    return hdfs_runid_dir, param_combination, best_val
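
# Usage sketch (illustrative only, not part of the source): random search draws `samples`
# values per hyperparameter from the given [lower_bound, upper_bound] boundaries (ints or
# floats). The SparkSession, training function and values below are assumptions.
def _example_random_search(spark):
    def train(learning_rate, batch_size):
        # user training code; should produce the metric being optimized
        pass

    boundary_dict = {'learning_rate': [0.0001, 0.1], 'batch_size': [32, 512]}
    hdfs_run_dir, best_params, best_metric = _launch(
        spark.sparkContext, train, boundary_dict, samples=20, direction='max',
        name='random_search_example')
    return best_params, best_metric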