Example #1
def launch(sc, map_fun, args_dict=None, local_logdir=False):
    """ Run the wrapper function on an executor, once per hyperparameter combination in args_dict (or once if args_dict is None)

    Args:
      :sc: SparkContext object
      :map_fun: The TensorFlow function to run
      :args_dict: (optional) A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
      :local_logdir: (optional) whether to keep the TensorBoard logdir on the executor's local filesystem
    """

    global run_id

    app_id = str(sc.applicationId)


    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        currentLen = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if currentLen != len(arg_lists[i]):
                raise ValueError('Length of each function argument list must be equal')
            num_executions = len(arg_lists[i])

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    nodeRDD.foreachPartition(_prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))

    print('Finished Experiment \n')

    if args_dict is None:
        path_to_metric = get_logdir(app_id) + '/metric'
        if pydoop.hdfs.path.exists(path_to_metric):
            with pydoop.hdfs.open(path_to_metric, "r") as fi:
                metric = float(fi.read())
            return metric, hopshdfs.get_experiments_dir() + '/' + app_id + '/launcher/run.' + str(run_id)

    return None, hopshdfs.get_experiments_dir() + '/' + app_id + '/launcher/run.' + str(run_id)
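
# Hedged usage sketch (illustration only, not part of the library): it assumes an
# existing SparkContext 'sc' and a hypothetical wrapper function 'train'. With an
# args_dict of equal-length lists, launch() runs one execution per list index and
# returns (None, hdfs_run_dir); with args_dict=None it runs once and, if that run
# wrote a 'metric' file, returns (metric, hdfs_run_dir).
def train(learning_rate, dropout):
    # hypothetical TensorFlow job; a real wrapper would train a model and report a metric
    print('lr=%s, dropout=%s' % (learning_rate, dropout))

_, run_dir = launch(sc, train, args_dict={'learning_rate': [0.001, 0.01],
                                          'dropout': [0.4, 0.6]})
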
def _evolutionary_launch(spark_session, map_fun, args_dict=None):
    """ Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
      :spark_session: SparkSession object
      :map_fun: The TensorFlow function to run
      :args_dict: (optional) A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
    """

    sc = spark_session.sparkContext
    app_id = str(sc.applicationId)

    if args_dict is None:
        num_executions = 1
    else:
        arg_lists = list(args_dict.values())
        currentLen = len(arg_lists[0])
        for i in range(len(arg_lists)):
            if currentLen != len(arg_lists[i]):
                raise ValueError(
                    'Length of each function argument list must be equal')
            num_executions = len(arg_lists[i])

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    global generation_id
    global run_id
    nodeRDD.foreachPartition(
        _prepare_func(app_id, generation_id, map_fun, args_dict, run_id))

    generation_id += 1

    return hopshdfs.get_experiments_dir() + '/' + app_id + "/"
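
# Illustrative sketch (assumed names and values, not library code) of the argument
# layout _evolutionary_launch expects: every key of args_dict maps to a list of equal
# length, and index i of each list forms the argument set for execution i. Each call
# processes one generation and then advances the global generation_id.
candidate_args = {
    'learning_rate': [0.001, 0.01, 0.1],   # three candidates in this generation
    'dropout':       [0.2,   0.4,  0.6],
}
# num_executions would be 3 here: the candidates (0.001, 0.2), (0.01, 0.4) and
# (0.1, 0.6) each run in their own Spark partition.
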
Example #3
def visualize(spark_session, hdfs_root_logdir):
    """ Visualize all TensorBoard events for a given path in HopsFS. This is intended for use after running TensorFlow jobs to visualize
    them all in the same TensorBoard. tflauncher.launch returns the path in HopsFS which should be handed as argument for this method to visualize all runs.

    Args:
      :spark_session: SparkSession object
      :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """

    sc = spark_session.sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    #find free port
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    tb_addr, tb_port = tb_socket.getsockname()

    tb_path = util.find_tensorboard()

    tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''

    tb_proc = subprocess.Popen(
        [pypath, tb_path,
         "--logdir=%s" % logdir,
         "--port=%d" % tb_port],
        env=tb_env,
        preexec_fn=util.on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs.get_experiments_dir() + "/" + app_id + "/TensorBoard.driver"
    #dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if extension != '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    tb_proc.wait()
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
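
# Hedged usage sketch (illustration only): per the docstring, the HDFS path returned
# by a launcher (e.g. the run directory from launch()) is what should be passed as
# hdfs_root_logdir. 'spark' is assumed to be an existing SparkSession and 'train' the
# hypothetical wrapper function from the earlier sketch.
_, run_dir = launch(spark.sparkContext, train,
                    args_dict={'learning_rate': [0.001, 0.01], 'dropout': [0.4, 0.6]})
visualize(spark, run_dir)   # blocks until the driver-side TensorBoard process exits
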
def _get_metric(param_string, app_id, generation_id, run_id):
    project_path = hopshdfs.project_path()
    handle = hopshdfs.get()
    for i in range(generation_id):
        possible_result_path = hopshdfs.get_experiments_dir() + '/' + app_id + '/differential_evolution/run.' \
                               + str(run_id) + '/generation.' + str(i) + '/' + param_string + '/metric'
        if handle.exists(possible_result_path):
            with pydoop.hdfs.open(possible_result_path, "r") as fi:
                metric = float(fi.read())
            return metric

    return None
def get_logdir(app_id):
    global run_id
    return hopshdfs.get_experiments_dir() + "/" + app_id + "/differential_evolution/run." + str(run_id)
def _search(spark,
            function,
            search_dict,
            direction='max',
            generations=10,
            popsize=10,
            mutation=0.5,
            crossover=0.7,
            cleanup_generations=False,
            local_logdir=False):

    global run_id
    global local_logdir_bool
    local_logdir_bool = local_logdir

    global spark_session
    spark_session = spark

    global objective_function
    objective_function = function

    global cleanup
    cleanup = cleanup_generations

    argcount = six.get_function_code(function).co_argcount
    arg_names = six.get_function_code(function).co_varnames

    ordered_arr = []

    app_id = spark.sparkContext.applicationId

    argIndex = 0
    while argcount != 0:
        ordered_arr.append(
            (arg_names[argIndex], search_dict[arg_names[argIndex]]))
        argcount = argcount - 1
        argIndex = argIndex + 1

    ordered_dict = OrderedDict(ordered_arr)

    bounds_list = []
    types_list = []

    for entry in ordered_dict:
        bounds_list.append((ordered_dict[entry][0], ordered_dict[entry][1]))

        if isinstance(ordered_dict[entry][0], int):
            types_list.append('int')
        elif isinstance(ordered_dict[entry][0], float):
            types_list.append('float')
        else:
            types_list.append('cat')

    global diff_evo
    diff_evo = DifferentialEvolution(execute_all,
                                     bounds_list,
                                     types_list,
                                     ordered_dict,
                                     direction=direction,
                                     generations=generations,
                                     popsize=popsize,
                                     crossover=crossover,
                                     mutation=mutation)

    root_dir = hopshdfs.get_experiments_dir() + "/" + str(
        app_id) + "/differential_evolution/run." + str(run_id)

    best_param, best_metric = diff_evo.solve(root_dir)

    print('Finished Experiment \n')

    return str(root_dir), best_param, best_metric
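
# Hedged usage sketch (illustration only): 'spark' is assumed to be an existing
# SparkSession. The objective function's argument names must match the keys of
# search_dict, and each value is read as a (lower_bound, upper_bound) pair whose
# element type (int/float/other) decides how the parameter is treated. The names
# and bounds below are assumptions, not library defaults.
def objective(learning_rate, dropout):
    # hypothetical objective; a real one would train a model and report its metric
    return 0.0

log_dir, best_params, best_metric = _search(spark, objective,
                                            {'learning_rate': (0.001, 0.1),
                                             'dropout': (0.1, 0.9)},
                                            direction='max', generations=5, popsize=10)
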
Example #7
def get_logdir(app_id):
    global run_id
    return hopshdfs.get_experiments_dir() + '/' + app_id + '/tensorflowonspark/run.' + str(run_id)
Example #8
def get_logdir(app_id):
    global run_id
    return hopshdfs.get_experiments_dir() + '/' + app_id + '/grid_search/run.' + str(run_id)
Example #9
def _grid_launch(sc, map_fun, args_dict, direction='max', local_logdir=False):
    """ Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
      :spark_session: SparkSession object
      :map_fun: The TensorFlow function to run
      :args_dict: (optional) A dictionary containing hyperparameter values to insert as arguments for each TensorFlow job
    """
    global run_id
    app_id = str(sc.applicationId)
    num_executions = 1

    if direction != 'max' and direction != 'min':
        raise ValueError('Invalid direction ' + direction +
                         ', must be max or min')

    arg_lists = list(args_dict.values())
    currentLen = len(arg_lists[0])
    for i in range(len(arg_lists)):
        if currentLen != len(arg_lists[i]):
            raise ValueError(
                'Length of each function argument list must be equal')
        num_executions = len(arg_lists[i])

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Force execution on executor, since GPU is located on executor
    job_start = datetime.datetime.now()
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))
    job_end = datetime.datetime.now()

    job_time_str = util.time_diff(job_start, job_end)

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    hdfs_appid_dir = hopshdfs.get_experiments_dir() + '/' + app_id
    hdfs_runid_dir = hdfs_appid_dir + '/grid_search/run.' + str(run_id)

    max_val, max_hp, min_val, min_hp, avg = _get_best(args_dict,
                                                      num_executions,
                                                      arg_names, arg_count,
                                                      hdfs_appid_dir, run_id)

    param_combination = ""
    best_val = ""

    if direction == 'max':
        param_combination = max_hp
        best_val = str(max_val)
        results = '\n------ Grid search results ------ direction(' + direction + ') \n' \
          'BEST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
          'WORST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
          'AVERAGE metric -- ' + str(avg) + '\n' \
          'Total job time ' + job_time_str + '\n'
        write_result(hdfs_runid_dir, results)
        print(results)
    elif direction == 'min':
        param_combination = min_hp
        best_val = str(min_val)
        results = '\n------ Grid search results ------ direction(' + direction + ') \n' \
        'BEST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
        'WORST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
        'AVERAGE metric -- ' + str(avg) + '\n' \
        'Total job time ' + job_time_str + '\n'
        write_result(hdfs_runid_dir, results)
        print(results)

    print('Finished Experiment \n')

    return hdfs_runid_dir, param_combination, best_val
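
# Hedged usage sketch (illustration only): assumes an existing SparkContext 'sc' and
# the hypothetical 'train' function from the earlier sketch. All value lists must have
# equal length; combination i is formed from the i-th entry of every list, and the
# best/worst combination is selected according to 'direction'.
run_dir, best_params, best_val = _grid_launch(
    sc, train,
    {'learning_rate': [0.001, 0.01, 0.1], 'dropout': [0.2, 0.4, 0.6]},
    direction='max')
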
Example #10
def get_logdir(app_id):
    global run_id
    return hopshdfs.get_experiments_dir() + '/' + app_id + '/launcher/run.' + str(run_id)