Code Example #1
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        try:
            #Arguments
            if args_dict:
                param_string, params, args = experiment_utils.build_parameters(
                    map_fun, executor_num, args_dict)
                hdfs_exec_logdir, hdfs_appid_logdir = experiment_utils._create_experiment_subdirectories(
                    app_id,
                    run_id,
                    param_string,
                    'random_search',
                    params=params)
                logfile = experiment_utils._init_logger(hdfs_exec_logdir)
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
                print(devices._get_gpu_info())
                print(
                    '-------------------------------------------------------')
                print('Started running task ' + param_string)
                task_start = time.time()
                retval = map_fun(*args)
                task_end = time.time()
                experiment_utils._handle_return(retval, hdfs_exec_logdir,
                                                optimization_key, logfile)
                time_str = 'Finished task ' + param_string + ' - took ' + experiment_utils._time_diff(
                    task_start, task_end)
                print(time_str)
                print('Returning metric ' + str(retval))
                print(
                    '-------------------------------------------------------')
        except:
            raise
        finally:
            experiment_utils._cleanup(tensorboard, t)
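These `_wrapper_fun` variants are not called directly: hops-util-py ships them to the executors through Spark, giving each partition exactly one executor index, which is why the `for i in iter` loop simply captures the last (and only) value. A minimal sketch of that dispatch, assuming a live SparkContext `sc`; the helper name `_run_wrappers` is illustrative, not the library's actual entry point:

    def _run_wrappers(sc, num_executors, wrapper_fun):
        # One partition per executor, each holding a single index, so that
        # "for i in iter: executor_num = i" inside the wrapper picks up its own index.
        node_rdd = sc.parallelize(range(num_executors), num_executors)
        node_rdd.foreachPartition(wrapper_fun)
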
Code Example #2
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        global local_logdir_bool

        try:
            #Arguments
            if args_dict:
                param_string, params, args = experiment_utils.build_parameters(map_fun, executor_num, args_dict)
                val = _get_return_file(param_string, app_id, generation_id, run_id)
                hdfs_exec_logdir, hdfs_appid_logdir = experiment_utils._create_experiment_subdirectories(app_id, run_id, param_string, 'differential_evolution', sub_type='generation.' + str(generation_id), params=params)
                logfile = experiment_utils._init_logger(hdfs_exec_logdir)
                tb_hdfs_path, tb_pid = tensorboard._register(hdfs_exec_logdir, hdfs_appid_logdir, executor_num, local_logdir=local_logdir_bool)
                print(devices._get_gpu_info())
                print('-------------------------------------------------------')
                print('Started running task ' + param_string)
                if val is not None:
                    val = json.loads(val)
                task_start = time.time()
                if val is None:
                    val = map_fun(*args)
                task_end = time.time()
                time_str = 'Finished task ' + param_string + ' - took ' + experiment_utils._time_diff(task_start, task_end)
                print(time_str)
                experiment_utils._handle_return(val, hdfs_exec_logdir, opt_key, logfile)
                print('Returning metric ' + str(val))
                print('-------------------------------------------------------')
        except:
            raise
        finally:
            experiment_utils._cleanup(tensorboard, t)
Code Example #3
File: launcher.py  Project: tkakantousis/hops-util-py
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        tb_pid = 0
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        try:
            #Arguments
            if args_dict:
                argcount = six.get_function_code(map_fun).co_argcount
                names = six.get_function_code(map_fun).co_varnames

                args = []
                argIndex = 0
                param_string = ''
                while argcount > 0:
                    #Get args for executor and run function
                    param_name = names[argIndex]
                    param_val = args_dict[param_name][executor_num]
                    param_string += str(param_name) + '=' + str(
                        param_val) + '.'
                    args.append(param_val)
                    argcount -= 1
                    argIndex += 1
                param_string = param_string[:-1]
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, param_string, 'launcher')
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)

                gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
                hopshdfs.log(gpu_str)
                print(gpu_str)
                print(
                    '-------------------------------------------------------')
                print('Started running task ' + param_string + '\n')
                hopshdfs.log('Started running task ' + param_string)
                task_start = datetime.datetime.now()
                map_fun(*args)
                task_end = datetime.datetime.now()
                time_str = 'Finished task ' + param_string + ' - took ' + util._time_diff(
                    task_start, task_end)
                print('\n' + time_str)
                print(
                    '-------------------------------------------------------')
                hopshdfs.log(time_str)
            else:
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, None, 'launcher')
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
                gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
                hopshdfs.log(gpu_str)
                print(gpu_str)
                print(
                    '-------------------------------------------------------')
                print('Started running task\n')
                hopshdfs.log('Started running task')
                task_start = datetime.datetime.now()
                retval = map_fun()
                task_end = datetime.datetime.now()
                if retval:
                    _handle_return(retval, hdfs_exec_logdir)
                time_str = 'Finished task - took ' + util._time_diff(
                    task_start, task_end)
                print('\n' + time_str)
                print(
                    '-------------------------------------------------------')
                hopshdfs.log(time_str)
        except:
            #Always do cleanup
            _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                t.do_run = False
                t.join()
            raise
        finally:
            try:
                if local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    util._store_local_tensorboard(local_tb, hdfs_exec_logdir)
            except:
                pass

        _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
Code Example #4
File: agent.py  Project: ErmiasG/kagent-chef
    def send(self):
        global logged_in
        global session
        if not logged_in:
            logger.info('Logging in to Hopsworks....')
            Heartbeat.login()
        else:
            system_status_to_delete = []
            try:
                logger.debug("Creating heartbeat reply...")
                disk_info = DiskInfo()
                memory_info = MemoryInfo()
                load_info = LoadInfo()
                services_list = self.construct_services_status()
                now = long(time.mktime(datetime.now().timetuple()))
                headers = {'content-type': 'application/json'}
                payload = {}
                payload["num-gpus"] = devices.get_num_gpus()
                payload["host-id"] = kconfig.host_id
                payload["agent-time"] = now
                payload["services"] = services_list
                payload["recover"] = self._recover

                self._system_commands_status_mutex.acquire()
                system_commands_response = []
                # Append command status to response
                for k, v in self._system_commands_status.iteritems():
                    system_commands_response.append(v)
                    system_status_to_delete.append(v)

                # Remove status from local statuses state
                for command_to_delete in system_status_to_delete:
                    del self._system_commands_status[command_to_delete['id']]
                self._system_commands_status_mutex.release()
                payload["system-commands"] = system_commands_response

                if (kconfig.private_ip != None):
                    payload["private-ip"] = kconfig.private_ip
                else:
                    payload["private-ip"] = ""

                payload["cores"] = cores
                payload['memory-capacity'] = memory_info.total
                logger.debug("Sending heartbeat...")
                resp = session.post(kconfig.heartbeat_url,
                                    data=json.dumps(payload),
                                    headers=headers,
                                    verify=False)
                logger.debug("Received heartbeat response")
                if not resp.status_code == HTTP_OK:
                    # Put back the deleted statuses whose command ID is missing so they can be re-sent
                    self._system_commands_status_mutex.acquire()
                    for restore_command in system_status_to_delete:
                        if restore_command[
                                'id'] not in self._system_commands_status:
                            self._system_commands_status[
                                restore_command['id']] = restore_command
                    self._system_commands_status_mutex.release()

                    logged_in = False
                    raise Exception(
                        'Heartbeat could not be sent (Status code: {0})'.
                        format(resp.status_code))
                else:
                    theResponse = resp.json()
                    logger.debug(
                        "Response from heartbeat is: {0}".format(theResponse))
                    self._recover = False
                    try:
                        system_commands = theResponse['system-commands']
                        for command in system_commands:
                            c = Command('SYSTEM_COMMAND', command)
                            logger.debug(
                                "Adding SYSTEM command with ID {0} and status {1} to Handler Queue"
                                .format(command['id'], command['status']))
                            commands_queue.put(c)
                            command['status'] = 'ONGOING'
                            self._system_commands_status_mutex.acquire()
                            self._system_commands_status[
                                command['id']] = command
                            self._system_commands_status_mutex.release()

                    except Exception as err:
                        logger.info("No commands to execute")

            except Exception as err:
                logger.error("{0}. Retrying in {1} seconds...".format(
                    err, kconfig.heartbeat_interval))
                logged_in = False
Code Example #5
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        task_index = None

        try:
            host = util._get_ip_address()

            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]

            client = allreduce_reservation.Client(server_addr)
            host_port = host + ":" + str(port)

            client.register({"worker": host_port, "index": executor_num})
            cluster = client.await_reservations()
            tmp_socket.close()
            client.close()

            task_index = _find_index(host_port, cluster)

            cluster["task"] = {"type": "worker", "index": task_index}

            os.environ["TF_CONFIG"] = json.dumps(cluster)

            if task_index == 0:
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, None, 'collective_all_reduce')
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
            gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
            if task_index == 0:
                hopshdfs.log(gpu_str)
            print(gpu_str)
            print('-------------------------------------------------------')
            print('Started running task \n')
            if task_index == 0:
                hopshdfs.log('Started running task')
            task_start = datetime.datetime.now()

            retval = map_fun()
            if task_index == 0:
                if retval:
                    _handle_return(retval, hdfs_exec_logdir)
            task_end = datetime.datetime.now()
            time_str = 'Finished task - took ' + util._time_diff(
                task_start, task_end)
            print('\n' + time_str)
            print('-------------------------------------------------------')
            if task_index == 0:
                hopshdfs.log(time_str)
        except:
            #Always do cleanup
            _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                t.do_run = False
                t.join()
            raise
        finally:
            if task_index == 0:
                if local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    util._store_local_tensorboard(local_tb, hdfs_exec_logdir)

        _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
Code Example #6
    def _wrapper_fun(iter):

        for i in iter:
            executor_num = i

        tb_pid = 0
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        global local_logdir_bool

        try:
            #Arguments
            if args_dict:
                argcount = six.get_function_code(map_fun).co_argcount
                names = six.get_function_code(map_fun).co_varnames

                args = []
                argIndex = 0
                param_string = ''
                while argcount > 0:
                    #Get args for executor and run function
                    param_name = names[argIndex]
                    param_val = args_dict[param_name][executor_num]
                    param_string += str(param_name) + '=' + str(
                        param_val) + '.'
                    args.append(param_val)
                    argcount -= 1
                    argIndex += 1
                param_string = param_string[:-1]

                val = _get_metric(param_string, app_id, generation_id, run_id)
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                    app_id,
                    run_id,
                    param_string,
                    'differential_evolution',
                    sub_type='generation.' + str(generation_id))
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs.init_logger()
                tb_hdfs_path, tb_pid = tensorboard.register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir_bool)
                gpu_str = '\nChecking for GPUs in the environment' + devices.get_gpu_info()
                hopshdfs.log(gpu_str)
                print(gpu_str)
                print(
                    '-------------------------------------------------------')
                print('Started running task ' + param_string + '\n')
                if val:
                    print('Reading returned metric from previous run: ' +
                          str(val))
                hopshdfs.log('Started running task ' + param_string)
                task_start = datetime.datetime.now()
                if not val:
                    val = map_fun(*args)
                task_end = datetime.datetime.now()
                time_str = 'Finished task ' + param_string + ' - took ' + util.time_diff(
                    task_start, task_end)
                print('\n' + time_str)
                hopshdfs.log(time_str)
                try:
                    castval = int(val)
                except:
                    raise ValueError(
                        'Your function needs to return a metric (number) which should be maximized or minimized'
                    )

                metric_file = hdfs_exec_logdir + '/metric'
                fs_handle = hopshdfs.get_fs()
                try:
                    fd = fs_handle.open_file(metric_file, mode='w')
                except:
                    fd = fs_handle.open_file(metric_file, flags='w')

                fd.write(str(float(val)).encode())
                fd.flush()
                fd.close()
                print('Returning metric ' + str(val))
                print(
                    '-------------------------------------------------------')
        except:
            #Always do cleanup
            if tb_hdfs_path:
                _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                t.do_run = False
                t.join()
            raise
        finally:
            if local_logdir_bool:
                local_tb = tensorboard.local_logdir_path
                util.store_local_tensorboard(local_tb, hdfs_exec_logdir)

        hopshdfs.log('Finished running')
        if tb_hdfs_path:
            _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
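Example #6 persists each trial's metric as a small HDFS file so a retried task can skip re-running `map_fun` (the `_get_metric` lookup at the top of the wrapper). A hedged sketch of the matching read path, reusing only the `hopshdfs` calls already seen in these examples; the function name and path handling are assumptions, not the library's actual code:

    def _read_metric(metric_file_path):
        # Read back the value written as str(float(val)) in Code Example #6.
        fs_handle = hopshdfs.get_fs()
        try:
            fd = fs_handle.open_file(metric_file_path, mode='r')
        except:
            fd = fs_handle.open_file(metric_file_path, flags='r')
        content = ''.join(chunk.decode() if isinstance(chunk, bytes) else chunk for chunk in fd)
        fd.close()
        return float(content.strip())
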
Code Example #7
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        experiment_utils._set_ml_id(app_id, run_id)

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        is_chief = False
        logdir = None
        tb_hdfs_path = None
        try:
            host = experiment_utils._get_ip_address()

            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]

            client = allreduce_reservation.Client(server_addr)
            host_port = host + ":" + str(port)

            client.register({"worker": host_port, "index": executor_num})
            cluster = client.await_reservations()
            tmp_socket.close()
            client.close()

            task_index = experiment_utils._find_index(host_port, cluster)

            if task_index == -1:
                cluster["task"] = {"type": "chief", "index": 0}
            else:
                cluster["task"] = {"type": "worker", "index": task_index}

            evaluator_node = None
            if evaluator:
                last_worker_index = len(cluster["cluster"]["worker"]) - 1
                evaluator_node = cluster["cluster"]["worker"][
                    last_worker_index]
                cluster["cluster"]["evaluator"] = [evaluator_node]
                del cluster["cluster"]["worker"][last_worker_index]
                if evaluator_node == host_port:
                    cluster["task"] = {"type": "evaluator", "index": 0}

            print('TF_CONFIG: {} '.format(cluster))

            if num_executors > 1:
                os.environ["TF_CONFIG"] = json.dumps(cluster)

            is_chief = (cluster["task"]["type"] == "chief")

            is_evaluator = (cluster["task"]["type"] == "evaluator")

            if is_chief:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tb_hdfs_path, tb_pid = tensorboard._register(
                    logdir, logdir, executor_num, local_logdir=local_logdir)
            elif is_evaluator:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tensorboard.events_logdir = logdir

            logfile = experiment_utils._init_logger(
                experiment_utils._get_logdir(app_id, run_id),
                role=cluster["task"]["type"],
                index=cluster["task"]["index"])

            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task')
            task_start = time.time()
            retval = map_fun()

            if is_chief:
                experiment_utils._handle_return_simple(
                    retval, experiment_utils._get_logdir(app_id, run_id),
                    logfile)

            task_end = time.time()
            time_str = 'Finished task - took ' + experiment_utils._time_diff(
                task_start, task_end)
            print(time_str)
            print('-------------------------------------------------------')
        except:
            raise
        finally:
            experiment_utils._cleanup(tensorboard, t)
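For orientation, the dict serialized into `TF_CONFIG` above follows TensorFlow's usual TF_CONFIG layout, extended with the evaluator handling shown in the example. With three executors and `evaluator=True` it would look roughly like this (addresses are placeholders):

    import json
    import os

    cluster = {
        "cluster": {
            "chief": ["10.0.0.1:2222"],
            "worker": ["10.0.0.2:2222"],
            "evaluator": ["10.0.0.3:2222"],
        },
        "task": {"type": "chief", "index": 0},  # this executor's role and index
    }
    os.environ["TF_CONFIG"] = json.dumps(cluster)
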
Code Example #8
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        role = None

        client = parameter_server_reservation.Client(server_addr)

        try:
            host = util._get_ip_address()

            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            exec_spec = {}
            if executor_num < num_ps:
                exec_spec["task_type"] = "ps"
            else:
                exec_spec["task_type"] = "worker"
            exec_spec["host_port"] = host_port
            exec_spec["gpus_present"] = devices.get_num_gpus() > 0

            client.register(exec_spec)

            cluster = client.await_reservations()

            tmp_socket.close()

            role, index = _find_task_and_index(host_port, cluster)

            cluster_spec = {}
            cluster_spec["cluster"] = cluster
            cluster_spec["task"] = {"type": role, "index": index}

            print(cluster_spec)

            os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

            if role == "chief":
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, None, 'parameter_server')
                pydoop.hdfs.dump('',
                                 os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir,
                    hdfs_appid_logdir,
                    executor_num,
                    local_logdir=local_logdir)
            gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
            if role == "chief":
                hopshdfs.log(gpu_str)
            print(gpu_str)
            print('-------------------------------------------------------')
            print('Started running task \n')
            if role == "chief":
                hopshdfs.log('Started running task')
            task_start = datetime.datetime.now()

            retval = None
            if role == "ps":
                ps_thread = threading.Thread(target=lambda: map_fun())
                ps_thread.start()
                print("waiting for workers")
                client.await_all_workers_finished()
                print("waiting finished")
            else:
                retval = map_fun()

            if role == "chief":
                if retval:
                    _handle_return(retval, hdfs_exec_logdir)

            task_end = datetime.datetime.now()
            time_str = 'Finished task - took ' + util._time_diff(
                task_start, task_end)
            print('\n' + time_str)
            print('-------------------------------------------------------')
            if role == "chief":
                hopshdfs.log(time_str)
        except:
            _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                t.do_run = False
                t.join()
            raise
        finally:
            if role == "chief":
                if local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    util._store_local_tensorboard(local_tb, hdfs_exec_logdir)
            try:
                if role == "worker" or role == "chief":
                    client.register_worker_finished()
                client.close()
            except:
                pass

        _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
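`_find_task_and_index` is not shown here; based on how `host_port` and the reservation `cluster` dict are used above, it plausibly maps this executor's address back to its TensorFlow job name and index. The following is a reconstruction under that assumption, not the library's actual implementation:

    def _find_task_and_index(host_port, cluster):
        # cluster is assumed to map job names ("chief", "ps", "worker")
        # to lists of "host:port" strings collected by the reservation server.
        for task_type, hosts in cluster.items():
            if host_port in hosts:
                return task_type, hosts.index(host_port)
        return None, None
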
Code Example #9
File: TFSparkNode.py  Project: Limmen/hops-util-py
    def _mapfn(iter):

        # Note: consuming the input iterator helps PySpark reuse this worker
        for i in iter:
            executor_id = i

        # assign TF job/task based on provided cluster_spec template (or use default/null values)
        job_name = 'default'
        task_index = -1
        cluster_id = cluster_meta['id']
        cluster_template = cluster_meta['cluster_template']
        for jobtype in cluster_template:
            nodes = cluster_template[jobtype]
            if executor_id in nodes:
                job_name = jobtype
                task_index = nodes.index(executor_id)
                break

        # get unique key (hostname, executor_id) for this executor
        host = util.get_ip_address()
        util.write_executor_id(executor_id)
        port = 0

        # check for existing TFManagers
        if TFSparkNode.mgr is not None and str(
                TFSparkNode.mgr.get('state')) != "'stopped'":
            if TFSparkNode.cluster_id == cluster_id:
                # raise an exception to force Spark to retry this "reservation" task on another executor
                raise Exception(
                    "TFManager already started on {0}, executor={1}, state={2}"
                    .format(host, executor_id,
                            str(TFSparkNode.mgr.get("state"))))
            else:
                # old state, just continue with creating new manager
                logging.warn(
                    "Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}"
                    .format(TFSparkNode.cluster_id, cluster_id))

        gpu_present = gpu_info.detect_gpu_present()

        client = reservation.Client(cluster_meta['server_addr'])

        logging.info("TFSparkNode.run register: {0}".format(gpu_present))
        client.register_gpu_presence(gpu_present)

        gpus_are_present_on_executors = client.await_gpu_check()
        logging.info("TFSparkNode.run await_gpu_check: {0}".format(
            gpus_are_present_on_executors))

        # check for existing TFManagers
        if TFSparkNode.mgr is not None and str(
                TFSparkNode.mgr.get('state')) != "'stopped'":
            if TFSparkNode.cluster_id == cluster_id:
                # raise an exception to force Spark to retry this "reservation" task on another executor
                raise Exception(
                    "TFManager already started on {0}, state={1}".format(
                        host, str(TFSparkNode.mgr.get("state"))))
            else:
                # old state, just continue with creating new manager
                logging.warn(
                    "Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}"
                    .format(TFSparkNode.cluster_id, cluster_id))

        # start a TFManager and get a free port
        # use a random uuid as the authkey
        authkey = uuid.uuid4().bytes
        addr = None

        if (gpus_are_present_on_executors):
            #Valid PS, does not have GPUs, will be started as a PS
            if job_name == 'ps' and gpu_present == False:
                # PS nodes must be remotely accessible in order to shutdown from Spark driver.
                TFSparkNode.mgr = TFManager.start(authkey,
                                                  ['control', 'error'],
                                                  'remote')
                addr = (host, TFSparkNode.mgr.address[1])

            #Invalid worker, all workers should have GPUs, this one will assume role as PS
            elif job_name == 'worker' and gpu_present == False:
                # PS nodes must be remotely accessible in order to shutdown from Spark driver.
                TFSparkNode.mgr = TFManager.start(authkey,
                                                  ['control', 'error'],
                                                  'remote')
                addr = (host, TFSparkNode.mgr.address[1])

            #Correct worker
            else:
                # worker nodes only need to be locally accessible within the executor for data feeding
                TFSparkNode.mgr = TFManager.start(authkey, queues)
                addr = TFSparkNode.mgr.address
        else:
            if job_name == 'ps':
                # PS nodes must be remotely accessible in order to shutdown from Spark driver.
                TFSparkNode.mgr = TFManager.start(authkey,
                                                  ['control', 'error'],
                                                  'remote')
                addr = (host, TFSparkNode.mgr.address[1])
            else:
                # worker nodes only need to be locally accessible within the executor for data feeding
                TFSparkNode.mgr = TFManager.start(authkey, queues)
                addr = TFSparkNode.mgr.address

        # initialize mgr state
        TFSparkNode.mgr.set('state', 'running')
        TFSparkNode.cluster_id = cluster_id

        # expand Hadoop classpath wildcards for JNI (Spark 2.x)
        if 'HADOOP_PREFIX' in os.environ:
            classpath = os.environ['CLASSPATH']
            hadoop_path = os.path.join(os.environ['HADOOP_PREFIX'], 'bin',
                                       'hadoop')
            hadoop_classpath = subprocess.check_output(
                [hadoop_path, 'classpath', '--glob']).decode()
            logging.debug("CLASSPATH: {0}".format(hadoop_classpath))
            os.environ['CLASSPATH'] = classpath + os.pathsep + hadoop_classpath

        # start TensorBoard if requested
        tb_pid = 0
        tb_port = 0
        # check server to see if this task is being retried (i.e. already reserved)
        client = reservation.Client(cluster_meta['server_addr'])
        cluster_info = client.get_reservations()
        tmp_sock = None
        node_meta = None
        for node in cluster_info:
            (nhost, nexec) = (node['host'], node['executor_id'])
            if nhost == host and nexec == executor_id:
                node_meta = node
                port = node['port']

        # if not already done, register everything we need to set up the cluster
        if node_meta is None:
            # first, find a free port for TF
            tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            tmp_sock.bind(('', port))
            port = tmp_sock.getsockname()[1]

            node_meta = {
                'executor_id': executor_id,
                'host': host,
                'job_name': job_name,
                'task_index': task_index,
                'port': port,
                'tb_pid': tb_pid,
                'tb_port': tb_port,
                'addr': addr,
                'authkey': authkey,
                'gpu_present': gpu_present
            }

            # register node metadata with server
            logging.info("TFSparkNode.run register: {0}".format(node_meta))
            client.register(node_meta)
            # wait for other nodes to finish reservations
            cluster_info = client.await_reservations()
            logging.info(
                "TFSparkNode.run await_reservations: {0}".format(cluster_info))
            client.close()

        # construct a TensorFlow clusterspec from cluster_info
        sorted_cluster_info = sorted(cluster_info,
                                     key=lambda k: k['executor_id'])
        spec = {}
        last_executor_id = -1
        for node in sorted_cluster_info:
            if (node['executor_id'] == last_executor_id):
                raise Exception("Duplicate worker/task in cluster_info")
            last_executor_id = node['executor_id']
            logging.info("node: {0}".format(node))
            (njob, nhost, nport) = (node['job_name'], node['host'],
                                    node['port'])
            hosts = [] if njob not in spec else spec[njob]
            hosts.append("{0}:{1}".format(nhost, nport))
            spec[njob] = hosts

        for node in cluster_info:
            if ((node_meta['host'] == node['host'])
                    and (node_meta['authkey'] == node['authkey'])):
                job_name = node['job_name']
                task_index = node['task_index']
                executor_id = node['executor_id']
                break

        hdfs_exec_logdir = ''
        if gpus_are_present_on_executors and gpu_present and job_name == 'worker' and task_index == 0:
            # When running with GPUs
            hdfs_exec_logdir, hdfs_appid_logdir = hdfs.create_directories(
                app_id, run_id, None, 'tensorflowonspark')
            tb_proc = tensorboard.register(hdfs_exec_logdir,
                                           hdfs_appid_logdir,
                                           0,
                                           local_logdir=local_logdir)
        elif not gpus_are_present_on_executors and job_name == 'worker' and task_index == 0:
            # When running with no GPUs
            hdfs_exec_logdir, hdfs_appid_logdir = hdfs.create_directories(
                app_id, run_id, None, 'tensorflowonspark')
            tb_proc = tensorboard.register(hdfs_exec_logdir,
                                           hdfs_appid_logdir,
                                           0,
                                           local_logdir=local_logdir)

        # construct a TensorFlow clusterspec from cluster_info
        sorted_cluster_info = sorted(cluster_info,
                                     key=lambda k: k['executor_id'])
        spec = {}
        for node in sorted_cluster_info:
            logging.info("node: {0}".format(node))
            (njob, nhost, nport) = (node['job_name'], node['host'],
                                    node['port'])
            hosts = [] if njob not in spec else spec[njob]
            hosts.append("{0}:{1}".format(nhost, nport))
            spec[njob] = hosts

        # update TF_CONFIG and reserve GPU for tf.estimator based code
        # Note: this will execute but be ignored by non-tf.estimator code
        tf_config = json.dumps({
            'cluster': spec,
            'task': {
                'type': job_name,
                'index': task_index
            },
            'environment': 'cloud'
        })
        os.environ['TF_CONFIG'] = tf_config

        # create a context object to hold metadata for TF
        ctx = TFNodeContext(executor_id, job_name, task_index, spec,
                            cluster_meta['default_fs'],
                            cluster_meta['working_dir'], TFSparkNode.mgr)

        # release port reserved for TF as late as possible
        if tmp_sock is not None:
            tmp_sock.close()

        # Background mode relies on reuse of the Python worker in Spark.
        if background:
            # However, Python worker reuse does not work on Windows, so check whether
            # the current script is running on Windows.
            if os.name == 'nt' or platform.system() == 'Windows':
                raise Exception("Background mode is not supported on Windows.")
            # Check if the config of reuse python worker is enabled on Spark.
            if not os.environ.get("SPARK_REUSE_WORKER"):
                raise Exception(
                    "Background mode relies reuse of python worker on Spark. This config 'spark.python.worker.reuse' is not enabled on Spark. Please enable it before using background."
                )

        def wrapper_fn(args, context):
            """Wrapper function that sets the sys.argv of the executor."""
            if isinstance(args, list):
                sys.argv = args
            fn(args, context)

        def wrapper_fn_background(args, context):
            """Wrapper function that signals exceptions to foreground process."""
            errq = TFSparkNode.mgr.get_queue('error')
            try:
                wrapper_fn(args, context)
            except Exception:
                errq.put(traceback.format_exc())
                errq.join()

        if job_name == 'ps' or background:
            # invoke the TensorFlow main function in a background thread
            logging.info(
                "Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process"
                .format(job_name, task_index, job_name, executor_id))

            p = multiprocessing.Process(target=wrapper_fn_background,
                                        args=(tf_args, ctx))
            if job_name == 'ps':
                p.daemon = True
            p.start()

            # for ps nodes only, wait indefinitely in foreground thread for a "control" event (None == "stop")
            if job_name == 'ps':
                queue = TFSparkNode.mgr.get_queue('control')
                equeue = TFSparkNode.mgr.get_queue('error')
                done = False
                while not done:
                    while (queue.empty() and equeue.empty()):
                        time.sleep(1)
                    if (not equeue.empty()):
                        e_str = equeue.get()
                        equeue.task_done()
                        raise Exception("exception in ps:\n" + e_str)
                    msg = queue.get(block=True)
                    logging.info("Got msg: {0}".format(msg))
                    if msg == None:
                        logging.info("Terminating PS")
                        TFSparkNode.mgr.set('state', 'stopped')
                        done = True
                    queue.task_done()
        else:

            t = threading.Thread(target=devices.print_periodic_gpu_utilization)
            if devices.get_num_gpus() > 0:
                t.start()

            # otherwise, just run TF function in the main executor/worker thread
            logging.info(
                "Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread"
                .format(job_name, task_index, executor_id))
            try:
                wrapper_fn(tf_args, ctx)
            except:
                raise
            finally:
                if local_logdir:
                    if gpus_are_present_on_executors and gpu_present and job_name == 'worker' and task_index == 0:
                        # When running with GPUs
                        local_tb = tensorboard.local_logdir_path
                        hopsutil.store_local_tensorboard(
                            local_tb, hdfs_exec_logdir)

                    elif not gpus_are_present_on_executors and job_name == 'worker' and task_index == 0:
                        # When running with no GPUs
                        local_tb = tensorboard.local_logdir_path
                        hopsutil.store_local_tensorboard(
                            local_tb, hdfs_exec_logdir)

                    if devices.get_num_gpus() > 0:
                        t.do_run = False
                        t.join()

            logging.info(
                "Finished TensorFlow {0}:{1} on cluster node {2}".format(
                    job_name, task_index, executor_id))
Code Example #10
File: dist_allreduce.py  Project: Limmen/hops-util-py
    def _wrapper_fun(iter):

        for i in iter:
            executor_num = i

        client = coordination_server.Client(server_addr)

        node_meta = {
            'host': get_ip_address(),
            'executor_cwd': os.getcwd(),
            'cuda_visible_devices_ordinals':
            devices.get_minor_gpu_device_numbers()
        }

        client.register(node_meta)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t_gpus.start()

        # Only spark executor with index 0 should create necessary HDFS directories and start mpirun
        # Other executors simply block until index 0 reports mpirun is finished

        clusterspec = client.await_reservations()

        #pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
        #hopshdfs.init_logger()
        #hopshdfs.log('Starting Spark executor with arguments')

        gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info()
        #hopshdfs.log(gpu_str)
        print(gpu_str)

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        mpi_logfile = open(mpi_logfile_path, 'w')

        py_runnable = localize_scripts(nb_path, clusterspec)

        # non-chief executor should not do mpirun
        if not executor_num == 0:
            client.await_mpirun_finished()
        else:
            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                app_id, run_id, param_string='Horovod')
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir, hdfs_appid_logdir, 0)

            mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                      ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                      ' mpirun -np ' + str(get_num_ps(clusterspec)) + ' --hostfile ' + get_hosts_file(clusterspec) + \
                      ' -bind-to none -map-by slot ' + \
                      ' -x LD_LIBRARY_PATH ' + \
                      ' -x HOROVOD_TIMELINE ' + \
                      ' -x TENSORBOARD_LOGDIR ' + \
                      ' -x NCCL_DEBUG=INFO ' + \
                      ' -mca pml ob1 -mca btl ^openib ' + \
                      os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable

            mpi = subprocess.Popen(mpi_cmd,
                                   shell=True,
                                   stdout=mpi_logfile,
                                   stderr=mpi_logfile,
                                   preexec_fn=util.on_executor_exit('SIGTERM'))

            t_log = threading.Thread(target=print_log)
            t_log.start()

            mpi.wait()

            client.register_mpirun_finished()

            if devices.get_num_gpus() > 0:
                t_gpus.do_run = False
                t_gpus.join()

            return_code = mpi.returncode

            if return_code != 0:
                cleanup(tb_hdfs_path)
                t_log.do_run = False
                t_log.join()
                raise Exception(
                    'mpirun FAILED, look in the logs for the error')

            cleanup(tb_hdfs_path)
            t_log.do_run = False
            t_log.join()
Code Example #11
File: allreduce.py  Project: Limmen/hops-util-py
    def _wrapper_fun(iter):

        for i in iter:
            executor_num = i

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'horovod')

        tb_pid = 0
        tb_hdfs_path = ''

        pydoop.hdfs.dump('',
                         os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())
        hopshdfs.init_logger()
        hopshdfs.log('Starting Spark executor with arguments')
        if executor_num == 0:
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                0,
                local_logdir=local_logdir)

        gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info()
        hopshdfs.log(gpu_str)
        print(gpu_str)

        #1. Download notebook file
        fs_handle = hopshdfs.get_fs()

        try:
            fd = fs_handle.open_file(nb_path, flags='r')
        except:
            fd = fs_handle.open_file(nb_path, mode='r')

        notebook = ''
        for line in fd:
            notebook += line

        path, filename = os.path.split(nb_path)
        f_nb = open(filename, "w+")
        f_nb.write(notebook)
        f_nb.flush()
        f_nb.close()

        # 2. Convert notebook to py file
        jupyter_runnable = os.path.abspath(
            os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
        conversion_cmd = jupyter_runnable + ' nbconvert --to python ' + filename
        conversion = subprocess.Popen(conversion_cmd,
                                      shell=True,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
        conversion.wait()
        stdout, stderr = conversion.communicate()
        print(stdout)
        print(stderr)

        # 3. Make py file runnable
        py_runnable = os.getcwd() + '/' + filename.split('.')[0] + '.py'
        st = os.stat(py_runnable)
        os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t_gpus.start()

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        mpi_logfile = open(mpi_logfile_path, 'w')

        # 4. Run allreduce
        mpi_np = os.environ['MPI_NP']
        mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                  ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                  ' mpirun -np ' + str(mpi_np) + \
                  ' -bind-to none -map-by slot ' + \
                  ' -x HOROVOD_TIMELINE ' + \
                  ' -x TENSORBOARD_LOGDIR ' + \
                  ' -x NCCL_DEBUG=INFO ' + \
                  os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable
        mpi = subprocess.Popen(mpi_cmd,
                               shell=True,
                               stdout=mpi_logfile,
                               stderr=mpi_logfile,
                               preexec_fn=util.on_executor_exit('SIGTERM'))

        t_log = threading.Thread(target=print_log)
        t_log.start()

        mpi.wait()

        if devices.get_num_gpus() > 0:
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if local_logdir:
            local_tb = tensorboard.local_logdir_path
            pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

        if return_code != 0:
            cleanup(tb_hdfs_path)
            t_log.do_run = False
            t_log.join()
            raise Exception('mpirun FAILED, look in the logs for the error')

        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()

        hopshdfs.kill_logger()
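Examples #10 and #11 pass `preexec_fn=util.on_executor_exit('SIGTERM')` when spawning `mpirun`. A common way to implement such a hook on Linux, and presumably what the helper does, is to ask the kernel to signal the child when its parent process dies; the sketch below is that standard prctl recipe, offered as an assumption rather than the library's exact code:

    import ctypes
    import signal

    def on_executor_exit(signame):
        """Return a preexec_fn that delivers `signame` to the child if the parent dies."""
        signum = int(getattr(signal, signame))

        def set_parent_exit_signal():
            PR_SET_PDEATHSIG = 1  # constant from <linux/prctl.h>
            libc = ctypes.CDLL("libc.so.6", use_errno=True)
            if libc.prctl(PR_SET_PDEATHSIG, signum) != 0:
                raise OSError("prctl(PR_SET_PDEATHSIG) failed")

        return set_parent_exit_signal
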
Code Example #12
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i

        tb_pid = 0
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        try:
            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                app_id, run_id, None, 'mirrored')
            pydoop.hdfs.dump('',
                             os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, tb_pid = tensorboard._register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                executor_num,
                local_logdir=local_logdir)
            gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print('-------------------------------------------------------')
            print('Started running task\n')
            hopshdfs.log('Started running task')
            task_start = datetime.datetime.now()
            retval = map_fun()
            task_end = datetime.datetime.now()
            if retval:
                _handle_return(retval, hdfs_exec_logdir)
            time_str = 'Finished task - took ' + util._time_diff(
                task_start, task_end)
            print('\n' + time_str)
            print('-------------------------------------------------------')
            hopshdfs.log(time_str)
        except:
            #Always do cleanup
            _cleanup(tb_hdfs_path)
            if devices.get_num_gpus() > 0:
                t.do_run = False
                t.join()
            raise
        finally:
            try:
                if local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    util._store_local_tensorboard(local_tb, hdfs_exec_logdir)
            except:
                pass

        _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
Code Example #13
    def _wrapper_fun(iter):
        """

        Args:
            iter:

        Returns:

        """

        for i in iter:
            executor_num = i


        experiment_utils._set_ml_id(app_id, run_id)

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        role = None
        logdir = None
        tb_hdfs_path = None

        client = parameter_server_reservation.Client(server_addr)

        try:
            host = experiment_utils._get_ip_address()

            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            exec_spec = {}
            if executor_num < num_ps:
                exec_spec["task_type"] = "ps"
            else:
                exec_spec["task_type"] = "worker"
            exec_spec["host_port"] = host_port
            exec_spec["gpus_present"] = devices.get_num_gpus() > 0

            client.register(exec_spec)

            cluster = client.await_reservations()

            tmp_socket.close()

            role, index = experiment_utils._find_task_and_index(host_port, cluster)

            cluster_spec = {}
            cluster_spec["cluster"] = cluster
            cluster_spec["task"] = {"type": role, "index": index}

            evaluator_node = None
            if evaluator:
                last_worker_index = len(cluster_spec["cluster"]["worker"])-1
                evaluator_node = cluster_spec["cluster"]["worker"][last_worker_index]
                cluster_spec["cluster"]["evaluator"] = [evaluator_node]
                del cluster_spec["cluster"]["worker"][last_worker_index]
                if evaluator_node == host_port:
                    role = "evaluator"
                    cluster_spec["task"] = {"type": "evaluator", "index": 0}

            print('TF_CONFIG: {} '.format(cluster_spec))
            os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

            logfile = experiment_utils._init_logger(experiment_utils._get_logdir(app_id, run_id), role=role, index=cluster_spec["task"]["index"])

            dist_logdir = experiment_utils._get_logdir(app_id, run_id) + '/logdir'

            is_chief = (cluster_spec["task"]["type"] == "chief")
            if is_chief:
                hdfs.mkdir(dist_logdir)
                tensorboard._register(dist_logdir, experiment_utils._get_logdir(app_id, run_id), executor_num, local_logdir=local_logdir)
            else:
                tensorboard.events_logdir = dist_logdir
                
            print(devices._get_gpu_info())
            print('-------------------------------------------------------')
            print('Started running task')
            task_start = time.time()

            retval=None
            if role == "ps":
                ps_thread = threading.Thread(target=lambda: map_fun())
                ps_thread.start()
                client.await_all_workers_finished()
            else:
                retval = map_fun()

            if role == "chief":
                experiment_utils._handle_return_simple(retval, experiment_utils._get_logdir(app_id, run_id), logfile)

            task_end = time.time()
            time_str = 'Finished task - took ' + experiment_utils._time_diff(task_start, task_end)
            print(time_str)
            print('-------------------------------------------------------')
        except:
            raise
        finally:
            if role != "ps":
                client.register_worker_finished()
            client.close()
            experiment_utils._cleanup(tensorboard, t)