Example #1
0
    def _wrapper_fun(iter):
        """Run ``map_fun`` as one worker of a collective all-reduce cluster.

        The executor reserves a free local port, registers ``host:port`` with
        the reservation client created from ``server_addr``, waits for the
        full cluster description and exports it through the ``TF_CONFIG``
        environment variable before calling ``map_fun``. Only the worker
        whose ``task_index`` is 0 creates the HDFS log directories, registers
        TensorBoard, writes to the HDFS log and handles the return value.

        ``app_id``, ``run_id``, ``map_fun``, ``local_logdir`` and
        ``server_addr`` are taken from the enclosing scope.

        Args:
            iter: Iterable of executor numbers for this partition. The last
                element is used as this executor's number.

        Returns:
            None.

        Raises:
            Any exception raised while reserving or running the task is
            re-raised after cleanup.
        """

        for i in iter:
            executor_num = i

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        # The monitor thread is only started when GPUs are reported.
        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        task_index = None

        try:
            host = util._get_ip_address()

            # Bind to port 0 so the OS picks a free port; the socket is held
            # open until the reservation round has completed.
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            client = None
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]

                client = allreduce_reservation.Client(server_addr)
                host_port = host + ":" + str(port)

                client.register({"worker": host_port, "index": executor_num})
                cluster = client.await_reservations()
            finally:
                # Release the socket and the reservation client even when
                # registration fails, so neither is leaked on the error path.
                try:
                    tmp_socket.close()
                finally:
                    if client is not None:
                        client.close()

            task_index = _find_index(host_port, cluster)

            cluster["task"] = {"type": "worker", "index": task_index}

            os.environ["TF_CONFIG"] = json.dumps(cluster)

            if task_index == 0:
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, None, 'collective_all_reduce')
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
            gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info(
            )
            if task_index == 0:
                hopshdfs.log(gpu_str)
            print(gpu_str)
            print('-------------------------------------------------------')
            print('Started running task \n')
            if task_index == 0:
                hopshdfs.log('Started running task')
            task_start = datetime.datetime.now()

            retval = map_fun()
            if task_index == 0:
                if retval:
                    _handle_return(retval, hdfs_exec_logdir)
            task_end = datetime.datetime.now()
            time_str = 'Finished task - took ' + util._time_diff(
                task_start, task_end)
            print('\n' + time_str)
            print('-------------------------------------------------------')
            if task_index == 0:
                hopshdfs.log(time_str)
        except:
            # Always do cleanup, then let the original exception propagate.
            _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                # Presumably polled by the monitor function -- TODO confirm.
                t.do_run = False
                t.join()
            raise
        finally:
            if task_index == 0:
                if local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    util._store_local_tensorboard(local_tb, hdfs_exec_logdir)

        # Success path: same cleanup as in the except branch.
        _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
Example #2
0
    def _wrapper_fun(iter):
        """Run ``map_fun`` once on this executor for a 'launcher' experiment.

        When ``args_dict`` is truthy, the positional arguments of ``map_fun``
        are looked up by name in ``args_dict`` and indexed by the executor
        number; the return value is ignored. Otherwise ``map_fun`` is called
        without arguments and a truthy return value is passed to
        ``_handle_return``.

        ``app_id``, ``run_id``, ``map_fun``, ``args_dict`` and
        ``local_logdir`` are taken from the enclosing scope.

        Args:
            iter: Iterable of executor numbers for this partition. The last
                element is used as this executor's number.

        Returns:
            None.

        Raises:
            Any exception raised while running the task is re-raised after
            cleanup.
        """

        for executor_num in iter:
            pass

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''
        separator = '-------------------------------------------------------'

        gpu_monitor = threading.Thread(
            target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            gpu_monitor.start()

        def _stop_gpu_monitor():
            # Signal the monitor thread to stop and wait for it.
            if devices.get_num_gpus() > 0:
                gpu_monitor.do_run = False
                gpu_monitor.join()

        try:
            has_params = bool(args_dict)
            call_args = []
            if has_params:
                # Resolve this executor's value for every positional argument
                # of map_fun, in declaration order.
                fun_code = six.get_function_code(map_fun)
                pieces = []
                for param_name in fun_code.co_varnames[:fun_code.co_argcount]:
                    param_val = args_dict[param_name][executor_num]
                    pieces.append(str(param_name) + '=' + str(param_val))
                    call_args.append(param_val)
                param_string = '.'.join(pieces)
                task_suffix = ' ' + param_string
            else:
                param_string = None
                task_suffix = ''

            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                app_id, run_id, param_string, 'launcher')
            pydoop.hdfs.dump('',
                             os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, _tb_pid = tensorboard._register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                executor_num,
                local_logdir=local_logdir)

            gpu_str = ('\nChecking for GPUs in the environment' +
                       devices._get_gpu_info())
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print(separator)
            print('Started running task' + task_suffix + '\n')
            hopshdfs.log('Started running task' + task_suffix)

            task_start = datetime.datetime.now()
            retval = map_fun(*call_args)
            task_end = datetime.datetime.now()

            # Only the argument-less variant forwards the return value.
            if not has_params and retval:
                _handle_return(retval, hdfs_exec_logdir)

            time_str = ('Finished task' + task_suffix + ' - took ' +
                        util._time_diff(task_start, task_end))
            print('\n' + time_str)
            print(separator)
            hopshdfs.log(time_str)
        except:
            # Always do cleanup, then let the original exception propagate.
            _cleanup(tb_hdfs_path)
            _stop_gpu_monitor()
            raise
        finally:
            # Best effort: a failure to store local logs is ignored.
            try:
                if local_logdir:
                    util._store_local_tensorboard(
                        tensorboard.local_logdir_path, hdfs_exec_logdir)
            except:
                pass

        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()
Example #3
0
def _grid_launch(sc, map_fun, args_dict, direction='max', local_logdir=False, name="no-name"):
    """
    Run the wrapper function with each hyperparameter combination as specified by the dictionary

    Args:
        sc:
        map_fun:
        args_dict:
        direction:
        local_logdir:
        name:

    Returns:

    """
    global run_id
    app_id = str(sc.applicationId)
    num_executions = 1

    if direction != 'max' and direction != 'min':
        raise ValueError('Invalid direction ' + direction +  ', must be max or min')

    arg_lists = list(args_dict.values())
    currentLen = len(arg_lists[0])
    for i in range(len(arg_lists)):
        if currentLen != len(arg_lists[i]):
            raise ValueError('Length of each function argument list must be equal')
        num_executions = len(arg_lists[i])

    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(num_executions), num_executions)

    #Make SparkUI intuitive by grouping jobs
    sc.setJobGroup("Grid Search", "{} | Hyperparameter Optimization".format(name))

    #Force execution on executor, since GPU is located on executor
    job_start = datetime.datetime.now()
    nodeRDD.foreachPartition(_prepare_func(app_id, run_id, map_fun, args_dict, local_logdir))
    job_end = datetime.datetime.now()

    job_time_str = util._time_diff(job_start, job_end)

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    hdfs_appid_dir = hopshdfs._get_experiments_dir() + '/' + app_id
    hdfs_runid_dir = _get_logdir(app_id)

    max_val, max_hp, min_val, min_hp, avg = _get_best(args_dict, num_executions, arg_names, arg_count, hdfs_appid_dir, run_id)

    param_combination = ""
    best_val = ""

    if direction == 'max':
        param_combination = max_hp
        best_val = str(max_val)
        results = '\n------ Grid Search results ------ direction(' + direction + ') \n' \
          'BEST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
          'WORST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
          'AVERAGE metric -- ' + str(avg) + '\n' \
          'Total job time ' + job_time_str + '\n'
        _write_result(hdfs_runid_dir, results)
        print(results)
    elif direction == 'min':
        param_combination = min_hp
        best_val = str(min_val)
        results = '\n------ Grid Search results ------ direction(' + direction + ') \n' \
        'BEST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
        'WORST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
        'AVERAGE metric -- ' + str(avg) + '\n' \
        'Total job time ' + job_time_str + '\n'
        _write_result(hdfs_runid_dir, results)
        print(results)


    print('Finished Experiment \n')

    return hdfs_runid_dir, param_combination, best_val
Example #4
0
    def _wrapper_fun(iter):
        """Run ``map_fun`` as one task of a parameter-server cluster.

        Executors whose number is below ``num_ps`` register as "ps", all
        others as "worker". After the reservation round the role and index
        returned by ``_find_task_and_index`` are exported through the
        ``TF_CONFIG`` environment variable. A "ps" task runs ``map_fun`` in a
        background thread and waits until all workers have finished; every
        other task calls ``map_fun`` directly. Only the "chief" creates the
        HDFS log directories, registers TensorBoard, writes to the HDFS log
        and handles the return value.

        ``app_id``, ``run_id``, ``map_fun``, ``local_logdir``, ``server_addr``
        and ``num_ps`` are taken from the enclosing scope.

        Args:
            iter: Iterable of executor numbers for this partition. The last
                element is used as this executor's number.

        Returns:
            None.

        Raises:
            Any exception raised while reserving or running the task is
            re-raised after cleanup.
        """

        for i in iter:
            executor_num = i

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        # The monitor thread is only started when GPUs are reported.
        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        role = None
        client = None

        try:
            # Created inside the try block so that a failing constructor still
            # goes through the cleanup below and stops the monitor thread.
            client = parameter_server_reservation.Client(server_addr)

            host = util._get_ip_address()

            # Bind to port 0 so the OS picks a free port; the socket is held
            # open until the reservation round has completed.
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]
                host_port = host + ":" + str(port)

                exec_spec = {
                    "task_type": "ps" if executor_num < num_ps else "worker",
                    "host_port": host_port,
                    "gpus_present": devices.get_num_gpus() > 0,
                }

                client.register(exec_spec)

                cluster = client.await_reservations()
            finally:
                # Do not leak the socket when the reservation fails.
                tmp_socket.close()

            role, index = _find_task_and_index(host_port, cluster)

            cluster_spec = {}
            cluster_spec["cluster"] = cluster
            cluster_spec["task"] = {"type": role, "index": index}

            print(cluster_spec)

            os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

            if role == "chief":
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, None, 'parameter_server')
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
            gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info(
            )
            if role == "chief":
                hopshdfs.log(gpu_str)
            print(gpu_str)
            print('-------------------------------------------------------')
            print('Started running task \n')
            if role == "chief":
                hopshdfs.log('Started running task')
            task_start = datetime.datetime.now()

            retval = None
            if role == "ps":
                # The parameter server runs in the background while this
                # thread blocks until every worker has reported completion.
                ps_thread = threading.Thread(target=map_fun)
                ps_thread.start()
                print("waiting for workers")
                client.await_all_workers_finished()
                print("waiting finished")
            else:
                retval = map_fun()

            if role == "chief":
                if retval:
                    _handle_return(retval, hdfs_exec_logdir)

            task_end = datetime.datetime.now()
            time_str = 'Finished task - took ' + util._time_diff(
                task_start, task_end)
            print('\n' + time_str)
            print('-------------------------------------------------------')
            if role == "chief":
                hopshdfs.log(time_str)
        except:
            # Always do cleanup, then let the original exception propagate.
            _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                # Presumably polled by the monitor function -- TODO confirm.
                t.do_run = False
                t.join()
            raise
        finally:
            try:
                if role == "chief" and local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    util._store_local_tensorboard(local_tb, hdfs_exec_logdir)
            finally:
                # Must run even if storing the TensorBoard logs failed:
                # "ps" tasks block in await_all_workers_finished() until the
                # workers have reported in. Both steps stay best effort.
                if client is not None:
                    try:
                        if role == "worker" or role == "chief":
                            client.register_worker_finished()
                    except Exception:
                        pass
                    try:
                        client.close()
                    except Exception:
                        pass

        # Success path: same cleanup as in the except branch.
        _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
    def _wrapper_fun(iter):
        """Evaluate one hyperparameter combination of a differential-evolution generation.

        The positional arguments of ``map_fun`` are looked up by name in
        ``args_dict`` and indexed by the executor number. If ``_get_metric``
        returns a truthy value for this combination it is reused instead of
        calling ``map_fun``. The resulting metric is written as a float to
        ``<hdfs_exec_logdir>/metric``. Nothing is evaluated when ``args_dict``
        is falsy.

        ``app_id``, ``run_id``, ``generation_id``, ``map_fun`` and
        ``args_dict`` are taken from the enclosing scope;
        ``local_logdir_bool`` is a module global.

        Args:
            :iter: Iterable of executor numbers for this partition. The last
                element is used as this executor's number.

        Returns:
            None.

        Raises:
            ValueError: if the metric cannot be converted with ``int()``.
            Any other exception raised while running the task is re-raised
            after cleanup.
        """

        for i in iter:
            executor_num = i

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        # The monitor thread is only started when GPUs are reported.
        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        global local_logdir_bool

        try:
            #Arguments
            if args_dict:
                argcount = six.get_function_code(map_fun).co_argcount
                names = six.get_function_code(map_fun).co_varnames

                args = []
                argIndex = 0
                param_string = ''
                while argcount > 0:
                    #Get args for executor and run function
                    param_name = names[argIndex]
                    param_val = args_dict[param_name][executor_num]
                    param_string += str(param_name) + '=' + str(
                        param_val) + '.'
                    args.append(param_val)
                    argcount -= 1
                    argIndex += 1
                # Drop the trailing '.' separator.
                param_string = param_string[:-1]

                val = _get_metric(param_string, app_id, generation_id, run_id)
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id,
                    run_id,
                    param_string,
                    'differential_evolution',
                    sub_type='generation.' + str(generation_id))
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, _tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir_bool)
                gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info(
                )
                hopshdfs.log(gpu_str)
                print(gpu_str)
                print(
                    '-------------------------------------------------------')
                print('Started running task ' + param_string + '\n')
                if val:
                    print('Reading returned metric from previous run: ' +
                          str(val))
                hopshdfs.log('Started running task ' + param_string)
                task_start = datetime.datetime.now()
                # NOTE(review): a previous metric that is falsy (e.g. 0) is
                # treated as missing and the task is run again.
                if not val:
                    val = map_fun(*args)
                task_end = datetime.datetime.now()
                time_str = 'Finished task ' + param_string + ' - took ' + util._time_diff(
                    task_start, task_end)
                print('\n' + time_str)
                hopshdfs.log(time_str)

                # Validate that the metric is numeric. Only Exception is
                # caught so KeyboardInterrupt/SystemExit are not turned into
                # a ValueError.
                try:
                    int(val)
                except Exception:
                    raise ValueError(
                        'Your function needs to return a metric (number) which should be maximized or minimized'
                    )

                metric_file = hdfs_exec_logdir + '/metric'
                fs_handle = hopshdfs.get_fs()
                # The keyword for the open mode differs between the two calls;
                # presumably this covers different pydoop versions -- TODO confirm.
                try:
                    fd = fs_handle.open_file(metric_file, mode='w')
                except Exception:
                    fd = fs_handle.open_file(metric_file, flags='w')

                # Close the handle even when writing or flushing fails.
                try:
                    fd.write(str(float(val)).encode())
                    fd.flush()
                finally:
                    fd.close()
                print('Returning metric ' + str(val))
                print(
                    '-------------------------------------------------------')
        except:
            #Always do cleanup, then let the original exception propagate.
            if tb_hdfs_path:
                _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                # Presumably polled by the monitor function -- TODO confirm.
                t.do_run = False
                # Wait at most 20 seconds for the monitor thread.
                t.join(20)
            raise
        finally:
            if local_logdir_bool:
                local_tb = tensorboard.local_logdir_path
                util._store_local_tensorboard(local_tb, hdfs_exec_logdir)

        hopshdfs.log('Finished running')
        if tb_hdfs_path:
            _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join(20)
Example #6
0
def _launch(sc,
            map_fun,
            args_dict,
            samples,
            direction='max',
            local_logdir=False,
            name="no-name"):
    """

    Args:
        sc:
        map_fun:
        args_dict:
        local_logdir:
        name:

    Returns:

    """
    global run_id

    app_id = str(sc.applicationId)

    arg_lists = list(args_dict.values())
    for i in range(len(arg_lists)):
        if len(arg_lists[i]) != 2:
            raise ValueError(
                'Boundary list must contain exactly two elements, [lower_bound, upper_bound] for each hyperparameter'
            )

    hp_names = args_dict.keys()

    random_dict = {}
    for hp in hp_names:
        lower_bound = args_dict[hp][0]
        upper_bound = args_dict[hp][1]

        assert lower_bound < upper_bound, "lower bound: " + str(
            lower_bound) + " must be less than upper bound: " + str(
                upper_bound)

        random_values = []

        if type(lower_bound) == int and type(upper_bound) == int:
            for i in range(samples):
                random_values.append(random.randint(lower_bound, upper_bound))
        elif type(lower_bound) == float and type(upper_bound) == float:
            for i in range(samples):
                random_values.append(random.uniform(lower_bound, upper_bound))
        else:
            raise ValueError('Only float and int is currently supported')

        random_dict[hp] = random_values

    random_dict, new_samples = _remove_duplicates(random_dict, samples)

    sc.setJobGroup("Random Search",
                   "{} | Hyperparameter Optimization".format(name))
    #Each TF task should be run on 1 executor
    nodeRDD = sc.parallelize(range(new_samples), new_samples)

    job_start = datetime.datetime.now()
    nodeRDD.foreachPartition(
        _prepare_func(app_id, run_id, map_fun, random_dict, local_logdir))
    job_end = datetime.datetime.now()

    job_time_str = util._time_diff(job_start, job_end)

    arg_count = six.get_function_code(map_fun).co_argcount
    arg_names = six.get_function_code(map_fun).co_varnames
    hdfs_appid_dir = hopshdfs._get_experiments_dir() + '/' + app_id
    hdfs_runid_dir = _get_logdir(app_id)

    max_val, max_hp, min_val, min_hp, avg = _get_best(random_dict, new_samples,
                                                      arg_names, arg_count,
                                                      hdfs_appid_dir, run_id)

    param_combination = ""
    best_val = ""

    if direction == 'max':
        param_combination = max_hp
        best_val = str(max_val)
        results = '\n------ Random Search results ------ direction(' + direction + ') \n' \
        'BEST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
        'WORST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
        'AVERAGE metric -- ' + str(avg) + '\n' \
        'Total job time ' + job_time_str + '\n'
        _write_result(hdfs_runid_dir, results)
        print(results)
    elif direction == 'min':
        param_combination = min_hp
        best_val = str(min_val)
        results = '\n------ Random Search results ------ direction(' + direction + ') \n' \
        'BEST combination ' + min_hp + ' -- metric ' + str(min_val) + '\n' \
        'WORST combination ' + max_hp + ' -- metric ' + str(max_val) + '\n' \
        'AVERAGE metric -- ' + str(avg) + '\n' \
        'Total job time ' + job_time_str + '\n'
        _write_result(hdfs_runid_dir, results)
        print(results)

    print('Finished Experiment \n')

    return hdfs_runid_dir, param_combination, best_val
Example #7
0
    def _wrapper_fun(iter):
        """Run ``map_fun`` once on this executor for a 'mirrored' experiment.

        Creates the HDFS log directories, registers TensorBoard, calls
        ``map_fun`` without arguments and passes a truthy return value to
        ``_handle_return``. Progress is both printed and written to the HDFS
        log.

        ``app_id``, ``run_id``, ``map_fun`` and ``local_logdir`` are taken
        from the enclosing scope.

        Args:
            iter: Iterable of executor numbers for this partition. The last
                element is used as this executor's number.

        Returns:
            None.

        Raises:
            Any exception raised while running the task is re-raised after
            cleanup.
        """

        for executor_num in iter:
            pass

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''
        separator = '-------------------------------------------------------'

        gpu_monitor = threading.Thread(
            target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            gpu_monitor.start()

        def _stop_gpu_monitor():
            # Signal the monitor thread to stop and wait for it.
            if devices.get_num_gpus() > 0:
                gpu_monitor.do_run = False
                gpu_monitor.join()

        try:
            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                app_id, run_id, None, 'mirrored')
            pydoop.hdfs.dump('',
                             os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, _tb_pid = tensorboard._register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                executor_num,
                local_logdir=local_logdir)

            gpu_str = ('\nChecking for GPUs in the environment' +
                       devices._get_gpu_info())
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print(separator)
            print('Started running task\n')
            hopshdfs.log('Started running task')

            task_start = datetime.datetime.now()
            retval = map_fun()
            task_end = datetime.datetime.now()

            if retval:
                _handle_return(retval, hdfs_exec_logdir)

            time_str = ('Finished task - took ' +
                        util._time_diff(task_start, task_end))
            print('\n' + time_str)
            print(separator)
            hopshdfs.log(time_str)
        except:
            # Always do cleanup, then let the original exception propagate.
            _cleanup(tb_hdfs_path)
            _stop_gpu_monitor()
            raise
        finally:
            # Best effort: a failure to store local logs is ignored.
            try:
                if local_logdir:
                    util._store_local_tensorboard(
                        tensorboard.local_logdir_path, hdfs_exec_logdir)
            except:
                pass

        _cleanup(tb_hdfs_path)
        _stop_gpu_monitor()