def _wrapper_fun(iter):
    """Run ``map_fun`` once for every executor number in ``iter``.

    For each executor number the call arguments are built with
    ``experiment_utils.build_parameters`` from ``args_dict``, a
    'random_search' experiment directory is created, TensorBoard is
    registered, and the value returned by ``map_fun`` is handed to
    ``experiment_utils._handle_return``. When ``args_dict`` is falsy
    nothing is run for that executor number.

    Args:
        iter: iterable of executor numbers.

    Returns:
        None
    """
    banner = '-------------------------------------------------------'

    for executor_num in iter:
        experiment_utils._set_ml_id(app_id, run_id)

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        # The monitor thread is always created (cleanup receives it) but is
        # only started when at least one GPU is reported.
        gpu_monitor = threading.Thread(
            target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            gpu_monitor.start()

        try:
            if args_dict:
                param_string, params, args = experiment_utils.build_parameters(
                    map_fun, executor_num, args_dict)
                hdfs_exec_logdir, hdfs_appid_logdir = (
                    experiment_utils._create_experiment_subdirectories(
                        app_id, run_id, param_string, 'random_search',
                        params=params))
                logfile = experiment_utils._init_logger(hdfs_exec_logdir)
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                    local_logdir=local_logdir)

                print(devices._get_gpu_info())
                print(banner)
                print('Started running task ' + param_string)

                task_start = time.time()
                retval = map_fun(*args)
                task_end = time.time()

                experiment_utils._handle_return(
                    retval, hdfs_exec_logdir, optimization_key, logfile)
                elapsed = experiment_utils._time_diff(task_start, task_end)
                print('Finished task ' + param_string + ' - took ' + elapsed)
                print('Returning metric ' + str(retval))
                print(banner)
        finally:
            # Runs on success and on failure; errors propagate unchanged.
            experiment_utils._cleanup(tensorboard, gpu_monitor)
def _wrapper_fun(iter):
    """Run one differential-evolution trial per executor number in ``iter``.

    The call arguments come from ``experiment_utils.build_parameters``.
    If ``_get_return_file`` yields a stored result for the same parameter
    string it is decoded with ``json.loads`` and reused; ``map_fun`` is only
    called when no (non-``None``) result is available. The result is passed
    to ``experiment_utils._handle_return``. When ``args_dict`` is falsy
    nothing is run for that executor number.

    Args:
        iter: iterable of executor numbers.

    Returns:
        None
    """
    global local_logdir_bool

    banner = '-------------------------------------------------------'

    for executor_num in iter:
        experiment_utils._set_ml_id(app_id, run_id)

        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        gpu_monitor = threading.Thread(
            target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            gpu_monitor.start()

        try:
            if args_dict:
                param_string, params, args = experiment_utils.build_parameters(
                    map_fun, executor_num, args_dict)

                stored = _get_return_file(param_string, app_id,
                                          generation_id, run_id)

                hdfs_exec_logdir, hdfs_appid_logdir = (
                    experiment_utils._create_experiment_subdirectories(
                        app_id, run_id, param_string,
                        'differential_evolution',
                        sub_type='generation.' + str(generation_id),
                        params=params))
                logfile = experiment_utils._init_logger(hdfs_exec_logdir)
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                    local_logdir=local_logdir_bool)

                print(devices._get_gpu_info())
                print(banner)
                print('Started running task ' + param_string)

                val = None if stored is None else json.loads(stored)

                task_start = time.time()
                if val is None:
                    val = map_fun(*args)
                task_end = time.time()

                elapsed = experiment_utils._time_diff(task_start, task_end)
                print('Finished task ' + param_string + ' - took ' + elapsed)
                experiment_utils._handle_return(val, hdfs_exec_logdir,
                                                opt_key, logfile)
                print('Returning metric ' + str(val))
                print(banner)
        finally:
            # Runs on success and on failure; errors propagate unchanged.
            experiment_utils._cleanup(tensorboard, gpu_monitor)
def _wrapper_fun(iter):
    """Run ``map_fun`` on this executor for every executor number in ``iter``.

    When ``args_dict`` is non-empty, the positional parameters of ``map_fun``
    are looked up by name in ``args_dict`` and indexed by the executor
    number; otherwise ``map_fun`` is called without arguments.

    A 'launcher' experiment directory is created, the logger and TensorBoard
    are set up, and timing information is printed and logged. Cleanup
    (storing the local TensorBoard directory, ``_cleanup`` and stopping the
    GPU monitor thread) happens exactly once, in ``finally``, on both the
    success and the failure path.

    Args:
        iter: iterable of executor numbers.

    Returns:
        None

    Raises:
        Whatever ``map_fun`` or the setup helpers raise; the exception is
        propagated after cleanup.
    """
    banner = '-------------------------------------------------------'

    for executor_num in iter:
        tb_pid = 0
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        try:
            if args_dict:
                # Pick this executor's value for every positional parameter
                # of map_fun, in declaration order.
                code = six.get_function_code(map_fun)
                names = code.co_varnames[:code.co_argcount]
                args = [args_dict[name][executor_num] for name in names]
                param_string = '.'.join(
                    str(name) + '=' + str(value)
                    for name, value in zip(names, args))
                task_label = 'task ' + param_string
            else:
                args = []
                param_string = None
                task_label = 'task'

            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                app_id, run_id, param_string, 'launcher')
            pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                             user=hopshdfs.project_user())
            hopshdfs._init_logger()
            tb_hdfs_path, tb_pid = tensorboard._register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                local_logdir=local_logdir)

            gpu_str = ('\nChecking for GPUs in the environment' +
                       devices._get_gpu_info())
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print(banner)
            print('Started running ' + task_label + '\n')
            hopshdfs.log('Started running ' + task_label)

            task_start = datetime.datetime.now()
            retval = map_fun(*args)
            task_end = datetime.datetime.now()

            # The return value is only persisted for the argument-less call.
            # NOTE(review): results of parameterised runs are discarded here,
            # as before - confirm that this is intended.
            if not args_dict and retval:
                _handle_return(retval, hdfs_exec_logdir)

            time_str = ('Finished ' + task_label + ' - took ' +
                        util._time_diff(task_start, task_end))
            print('\n' + time_str)
            print(banner)
            hopshdfs.log(time_str)
        finally:
            # Storing the local TensorBoard directory is best effort: report
            # the problem instead of hiding it, but never let it prevent the
            # cleanup below or mask the original error.
            try:
                if local_logdir:
                    util._store_local_tensorboard(
                        tensorboard.local_logdir_path, hdfs_exec_logdir)
            except Exception as err:
                print('Could not store local TensorBoard logdir: ' + str(err))

            try:
                _cleanup(tb_hdfs_path)
            finally:
                if devices.get_num_gpus() > 0:
                    t.do_run = False
                    t.join()
def send(self):
    """Send one heartbeat and queue the system commands returned by the server.

    If the agent is not logged in, only ``Heartbeat.login()`` is called.
    Otherwise the heartbeat payload is built, the locally stored system
    command statuses are moved into the payload, and the payload is POSTed
    to ``kconfig.heartbeat_url``.

    If the POST raises or the response status is not ``HTTP_OK``, the
    statuses taken for this heartbeat are put back (unless a status with the
    same ID has been stored in the meantime) so that they are sent again.

    Any exception is logged and resets the module-level ``logged_in`` flag;
    nothing is raised to the caller.

    Returns:
        None
    """
    global logged_in
    global session

    if not logged_in:
        logger.info('Logging in to Hopsworks....')
        Heartbeat.login()
        return

    # Statuses removed from the local state for this heartbeat.
    system_status_to_delete = []

    def restore_undelivered():
        """Put back statuses that did not reach the server."""
        with self._system_commands_status_mutex:
            for status in system_status_to_delete:
                self._system_commands_status.setdefault(status['id'], status)

    try:
        logger.debug("Creating heartbeat reply...")
        # disk_info and load_info are not read below; they are still
        # constructed because their side effects are not visible from here.
        disk_info = DiskInfo()
        memory_info = MemoryInfo()
        load_info = LoadInfo()
        services_list = self.construct_services_status()
        now = long(time.mktime(datetime.now().timetuple()))
        headers = {'content-type': 'application/json'}

        payload = {}
        payload["num-gpus"] = devices.get_num_gpus()
        payload["host-id"] = kconfig.host_id
        payload["agent-time"] = now
        payload["services"] = services_list
        payload["recover"] = self._recover

        # `with` guarantees the mutex is released even if a status entry is
        # malformed and the deletion raises.
        with self._system_commands_status_mutex:
            system_status_to_delete.extend(
                self._system_commands_status.values())
            for command_to_delete in system_status_to_delete:
                del self._system_commands_status[command_to_delete['id']]

        payload["system-commands"] = list(system_status_to_delete)
        if kconfig.private_ip is not None:
            payload["private-ip"] = kconfig.private_ip
        else:
            payload["private-ip"] = ""
        payload["cores"] = cores
        payload['memory-capacity'] = memory_info.total

        body = json.dumps(payload)

        logger.debug("Sending heartbeat...")
        try:
            # NOTE(review): TLS certificate verification is disabled here.
            resp = session.post(kconfig.heartbeat_url,
                                data=body,
                                headers=headers,
                                verify=False)
        except Exception:
            # The request never completed, so the statuses were not
            # delivered; keep them for the next heartbeat.
            restore_undelivered()
            raise
        logger.debug("Received heartbeat response")

        if resp.status_code != HTTP_OK:
            restore_undelivered()
            raise Exception(
                'Heartbeat could not be sent (Status code: {0})'.format(
                    resp.status_code))

        theResponse = resp.json()
        logger.debug("Response from heartbeat is: {0}".format(theResponse))
        self._recover = False
        try:
            system_commands = theResponse['system-commands']
            for command in system_commands:
                c = Command('SYSTEM_COMMAND', command)
                logger.debug(
                    "Adding SYSTEM command with ID {0} and status {1} to Handler Queue"
                    .format(command['id'], command['status']))
                commands_queue.put(c)
                command['status'] = 'ONGOING'
                with self._system_commands_status_mutex:
                    self._system_commands_status[command['id']] = command
        except Exception as err:
            logger.info("No commands to execute")
    except Exception as err:
        logger.error("{0}. Retrying in {1} seconds...".format(
            err, kconfig.heartbeat_interval))
        logged_in = False
def _wrapper_fun(iter):
    """Join the all-reduce cluster and run ``map_fun`` for each executor number.

    The executor reserves a free local port, registers ``host:port`` with the
    reservation server at ``server_addr`` and waits for the cluster
    description. Its index in the cluster is written into ``TF_CONFIG``.
    Only the executor with task index 0 creates the
    'collective_all_reduce' experiment directory, initialises the logger,
    registers TensorBoard and persists a truthy return value of ``map_fun``.

    The temporary socket and the reservation client are closed even if the
    reservation fails, and cleanup runs exactly once, in ``finally``.

    Args:
        iter: iterable of executor numbers.

    Returns:
        None

    Raises:
        Whatever the reservation, setup helpers or ``map_fun`` raise; the
        exception is propagated after cleanup.
    """
    banner = '-------------------------------------------------------'

    for executor_num in iter:
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        task_index = None
        try:
            host = util._get_ip_address()

            # Bind to port 0 so the OS picks a free port; the socket is only
            # held until the reservation has completed.
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]
                host_port = host + ":" + str(port)

                client = allreduce_reservation.Client(server_addr)
                try:
                    client.register({"worker": host_port,
                                     "index": executor_num})
                    cluster = client.await_reservations()
                finally:
                    client.close()
            finally:
                tmp_socket.close()

            task_index = _find_index(host_port, cluster)
            cluster["task"] = {"type": "worker", "index": task_index}
            os.environ["TF_CONFIG"] = json.dumps(cluster)

            is_first = task_index == 0

            if is_first:
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, None, 'collective_all_reduce')
                pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                    local_logdir=local_logdir)

            gpu_str = ('\nChecking for GPUs in the environment' +
                       devices._get_gpu_info())
            if is_first:
                hopshdfs.log(gpu_str)
            print(gpu_str)
            print(banner)
            print('Started running task \n')
            if is_first:
                hopshdfs.log('Started running task')

            task_start = datetime.datetime.now()
            retval = map_fun()
            if is_first and retval:
                _handle_return(retval, hdfs_exec_logdir)
            task_end = datetime.datetime.now()

            time_str = 'Finished task - took ' + util._time_diff(
                task_start, task_end)
            print('\n' + time_str)
            print(banner)
            if is_first:
                hopshdfs.log(time_str)
        finally:
            # Nested so that a failure while storing TensorBoard data cannot
            # skip the cleanup or leave the GPU monitor thread running.
            try:
                if task_index == 0 and local_logdir:
                    util._store_local_tensorboard(
                        tensorboard.local_logdir_path, hdfs_exec_logdir)
            finally:
                try:
                    _cleanup(tb_hdfs_path)
                finally:
                    if devices.get_num_gpus() > 0:
                        t.do_run = False
                        t.join()
def _wrapper_fun(iter):
    """Run one differential-evolution trial per executor number in ``iter``.

    The positional parameters of ``map_fun`` are looked up by name in
    ``args_dict`` and indexed by the executor number. If ``_get_metric``
    returns a truthy value for the same parameter string, that value is
    reused and ``map_fun`` is not called. The metric must be convertible
    with ``int()``; it is written as a float string to ``<logdir>/metric``.
    When ``args_dict`` is falsy no trial is run.

    Cleanup runs exactly once, in ``finally``, on both the success and the
    failure path.

    Args:
        iter: iterable of executor numbers.

    Returns:
        None

    Raises:
        ValueError: if the metric cannot be converted to a number.
    """
    global local_logdir_bool

    banner = '-------------------------------------------------------'

    for executor_num in iter:
        tb_pid = 0
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        try:
            if args_dict:
                # Pick this executor's value for every positional parameter
                # of map_fun, in declaration order.
                code = six.get_function_code(map_fun)
                names = code.co_varnames[:code.co_argcount]
                args = [args_dict[name][executor_num] for name in names]
                param_string = '.'.join(
                    str(name) + '=' + str(value)
                    for name, value in zip(names, args))

                val = _get_metric(param_string, app_id, generation_id, run_id)

                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                    app_id, run_id, param_string, 'differential_evolution',
                    sub_type='generation.' + str(generation_id))
                pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs.init_logger()
                tb_hdfs_path, tb_pid = tensorboard.register(
                    hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                    local_logdir=local_logdir_bool)

                gpu_str = ('\nChecking for GPUs in the environment' +
                           devices.get_gpu_info())
                hopshdfs.log(gpu_str)
                print(gpu_str)
                print(banner)
                print('Started running task ' + param_string + '\n')
                # NOTE(review): a stored metric that is falsy (e.g. 0) is
                # treated as missing and the trial is run again - confirm
                # what _get_metric returns before changing this test.
                if val:
                    print('Reading returned metric from previous run: ' +
                          str(val))
                hopshdfs.log('Started running task ' + param_string)

                task_start = datetime.datetime.now()
                if not val:
                    val = map_fun(*args)
                task_end = datetime.datetime.now()

                time_str = ('Finished task ' + param_string + ' - took ' +
                            util.time_diff(task_start, task_end))
                print('\n' + time_str)
                hopshdfs.log(time_str)

                # Validate that the metric is numeric before persisting it.
                try:
                    int(val)
                except Exception:
                    raise ValueError(
                        'Your function needs to return a metric (number) which should be maximized or minimized'
                    )

                metric_file = hdfs_exec_logdir + '/metric'
                fs_handle = hopshdfs.get_fs()
                # The keyword for the open mode differs between versions of
                # the filesystem API, so try both spellings.
                try:
                    fd = fs_handle.open_file(metric_file, mode='w')
                except Exception:
                    fd = fs_handle.open_file(metric_file, flags='w')
                try:
                    fd.write(str(float(val)).encode())
                    fd.flush()
                finally:
                    fd.close()

                print('Returning metric ' + str(val))
                print(banner)
        finally:
            # Nested so that a failure while storing TensorBoard data cannot
            # skip the cleanup or leave the GPU monitor thread running.
            try:
                if local_logdir_bool:
                    util.store_local_tensorboard(
                        tensorboard.local_logdir_path, hdfs_exec_logdir)
                hopshdfs.log('Finished running')
            finally:
                try:
                    if tb_hdfs_path:
                        _cleanup(tb_hdfs_path)
                finally:
                    if devices.get_num_gpus() > 0:
                        t.do_run = False
                        t.join()
def _wrapper_fun(iter):
    """Join the all-reduce cluster and run ``map_fun`` for each executor number.

    The executor reserves a free local port, registers ``host:port`` with the
    reservation server and waits for the cluster description. An index of -1
    from ``experiment_utils._find_index`` makes this executor the chief,
    otherwise it is a worker. When ``evaluator`` is truthy, the last worker
    in the cluster is moved to an 'evaluator' entry. ``TF_CONFIG`` is only
    exported when ``num_executors`` is greater than 1.

    The chief registers TensorBoard and persists the return value of
    ``map_fun`` with ``experiment_utils._handle_return_simple``.

    Args:
        iter: iterable of executor numbers.

    Returns:
        None
    """
    banner = '-------------------------------------------------------'

    for executor_num in iter:
        experiment_utils._set_ml_id(app_id, run_id)

        gpu_monitor = threading.Thread(
            target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            gpu_monitor.start()

        is_chief = False
        logdir = None
        tb_hdfs_path = None

        try:
            host = experiment_utils._get_ip_address()

            # Bind to port 0 so the OS picks a free port.
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]

            client = allreduce_reservation.Client(server_addr)
            host_port = host + ":" + str(port)
            client.register({"worker": host_port, "index": executor_num})
            cluster = client.await_reservations()
            tmp_socket.close()
            client.close()

            task_index = experiment_utils._find_index(host_port, cluster)
            if task_index == -1:
                task = {"type": "chief", "index": 0}
            else:
                task = {"type": "worker", "index": task_index}
            cluster["task"] = task

            evaluator_node = None
            if evaluator:
                # The last worker in the list becomes the evaluator.
                workers = cluster["cluster"]["worker"]
                last_worker_index = len(workers) - 1
                evaluator_node = workers[last_worker_index]
                cluster["cluster"]["evaluator"] = [evaluator_node]
                del workers[last_worker_index]
                if evaluator_node == host_port:
                    cluster["task"] = {"type": "evaluator", "index": 0}

            print('TF_CONFIG: {} '.format(cluster))

            if num_executors > 1:
                os.environ["TF_CONFIG"] = json.dumps(cluster)

            role = cluster["task"]["type"]
            is_chief = role == "chief"
            is_evaluator = role == "evaluator"

            if is_chief:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tb_hdfs_path, tb_pid = tensorboard._register(
                    logdir, logdir, executor_num, local_logdir=local_logdir)
            elif is_evaluator:
                logdir = experiment_utils._get_logdir(app_id, run_id)
                tensorboard.events_logdir = logdir

            logfile = experiment_utils._init_logger(
                experiment_utils._get_logdir(app_id, run_id),
                role=cluster["task"]["type"],
                index=cluster["task"]["index"])

            print(devices._get_gpu_info())
            print(banner)
            print('Started running task')

            task_start = time.time()
            retval = map_fun()
            if is_chief:
                experiment_utils._handle_return_simple(
                    retval, experiment_utils._get_logdir(app_id, run_id),
                    logfile)
            task_end = time.time()

            elapsed = experiment_utils._time_diff(task_start, task_end)
            print('Finished task - took ' + elapsed)
            print(banner)
        finally:
            # Runs on success and on failure; errors propagate unchanged.
            experiment_utils._cleanup(tensorboard, gpu_monitor)
def _wrapper_fun(iter):
    """Join the parameter-server cluster and run ``map_fun`` per executor number.

    Executors whose number is below ``num_ps`` register as 'ps', all others
    as 'worker'. After the reservation the role and index returned by
    ``_find_task_and_index`` are exported through ``TF_CONFIG``.

    A 'ps' runs ``map_fun`` in a background thread and blocks until the
    reservation server reports that all workers have finished. The 'chief'
    additionally creates the 'parameter_server' experiment directory,
    initialises the logger, registers TensorBoard and persists a truthy
    return value of ``map_fun``.

    In ``finally`` workers and the chief always report that they have
    finished (even if storing TensorBoard data fails), the client is closed,
    and cleanup runs exactly once.

    Args:
        iter: iterable of executor numbers.

    Returns:
        None

    Raises:
        Whatever the reservation, setup helpers or ``map_fun`` raise; the
        exception is propagated after cleanup.
    """
    banner = '-------------------------------------------------------'

    for executor_num in iter:
        tb_hdfs_path = ''
        hdfs_exec_logdir = ''

        t = threading.Thread(target=devices._print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        role = None
        client = parameter_server_reservation.Client(server_addr)

        try:
            host = util._get_ip_address()

            # Bind to port 0 so the OS picks a free port; the socket is
            # released as soon as the reservation completes or fails.
            tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                tmp_socket.bind(('', 0))
                port = tmp_socket.getsockname()[1]
                host_port = host + ":" + str(port)

                exec_spec = {}
                if executor_num < num_ps:
                    exec_spec["task_type"] = "ps"
                else:
                    exec_spec["task_type"] = "worker"
                exec_spec["host_port"] = host_port
                exec_spec["gpus_present"] = devices.get_num_gpus() > 0

                client.register(exec_spec)
                cluster = client.await_reservations()
            finally:
                tmp_socket.close()

            role, index = _find_task_and_index(host_port, cluster)

            cluster_spec = {}
            cluster_spec["cluster"] = cluster
            cluster_spec["task"] = {"type": role, "index": index}
            print(cluster_spec)
            os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

            is_chief = role == "chief"

            if is_chief:
                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
                    app_id, run_id, None, 'parameter_server')
                pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'],
                                 user=hopshdfs.project_user())
                hopshdfs._init_logger()
                tb_hdfs_path, tb_pid = tensorboard._register(
                    hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                    local_logdir=local_logdir)

            gpu_str = ('\nChecking for GPUs in the environment' +
                       devices._get_gpu_info())
            if is_chief:
                hopshdfs.log(gpu_str)
            print(gpu_str)
            print(banner)
            print('Started running task \n')
            if is_chief:
                hopshdfs.log('Started running task')

            task_start = datetime.datetime.now()

            retval = None
            if role == "ps":
                # The ps runs map_fun in the background; this thread only
                # waits until every worker has reported completion.
                ps_thread = threading.Thread(target=lambda: map_fun())
                ps_thread.start()
                print("waiting for workers")
                client.await_all_workers_finished()
                print("waiting finished")
            else:
                retval = map_fun()

            if is_chief and retval:
                _handle_return(retval, hdfs_exec_logdir)

            task_end = datetime.datetime.now()
            time_str = 'Finished task - took ' + util._time_diff(
                task_start, task_end)
            print('\n' + time_str)
            print(banner)
            if is_chief:
                hopshdfs.log(time_str)
        finally:
            try:
                if role == "chief" and local_logdir:
                    util._store_local_tensorboard(
                        tensorboard.local_logdir_path, hdfs_exec_logdir)
            finally:
                # Must run even if the step above failed: the ps waits for
                # every worker to report that it has finished.
                try:
                    try:
                        if role == "worker" or role == "chief":
                            client.register_worker_finished()
                    finally:
                        client.close()
                except Exception as err:
                    # Best effort, but no longer silent.
                    print('Could not notify or close the reservation client: '
                          + str(err))

                try:
                    _cleanup(tb_hdfs_path)
                finally:
                    if devices.get_num_gpus() > 0:
                        t.do_run = False
                        t.join()
def _mapfn(iter):
    """Reserve this executor's place in the TensorFlow cluster and run ``fn``.

    Steps, in order:

    1. Derive the job name and task index from the cluster template.
    2. Refuse to start if a non-stopped TFManager of the same cluster is
       already present on this executor (so Spark retries elsewhere).
    3. Exchange GPU presence with the reservation server and start a
       TFManager: remotely accessible for parameter servers (and, when GPUs
       exist in the cluster, for 'ps'/'worker' executors without a GPU),
       otherwise local only.
    4. Register node metadata, wait for all reservations and build the
       cluster spec, which is exported through ``TF_CONFIG``.
    5. Run ``fn`` either in a background process ('ps' or background mode)
       or in the foreground with a GPU monitor thread.

    Args:
        iter: iterable whose last element is used as the executor id.

    Raises:
        Exception: if a TFManager of this cluster is already running, if
            ``cluster_info`` contains a duplicate executor id, if background
            mode is requested on an unsupported setup, or if the parameter
            server process reports an error.
    """
    # Note: consuming the input iterator helps Pyspark re-use this worker,
    for i in iter:
        executor_id = i

    # assign TF job/task based on provided cluster_spec template (or use default/null values)
    job_name = 'default'
    task_index = -1
    cluster_id = cluster_meta['id']
    cluster_template = cluster_meta['cluster_template']
    for jobtype in cluster_template:
        nodes = cluster_template[jobtype]
        if executor_id in nodes:
            job_name = jobtype
            task_index = nodes.index(executor_id)
            break

    # get unique key (hostname, executor_id) for this executor
    host = util.get_ip_address()
    util.write_executor_id(executor_id)
    port = 0

    def _check_existing_manager(with_executor):
        """Raise if a live TFManager of this cluster exists; warn about a stale one."""
        mgr = TFSparkNode.mgr
        if mgr is None or str(mgr.get('state')) == "'stopped'":
            return
        if TFSparkNode.cluster_id == cluster_id:
            # raise an exception to force Spark to retry this "reservation" task on another executor
            state = str(mgr.get("state"))
            if with_executor:
                raise Exception(
                    "TFManager already started on {0}, executor={1}, state={2}"
                    .format(host, executor_id, state))
            raise Exception(
                "TFManager already started on {0}, state={1}".format(
                    host, state))
        # old state, just continue with creating new manager
        logging.warning(
            "Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}"
            .format(TFSparkNode.cluster_id, cluster_id))

    # check for existing TFManagers
    _check_existing_manager(with_executor=True)

    gpu_present = gpu_info.detect_gpu_present()
    # NOTE(review): this first client is replaced further down without an
    # explicit close() - confirm whether it should be closed or reused.
    client = reservation.Client(cluster_meta['server_addr'])
    logging.info("TFSparkNode.run register: {0}".format(gpu_present))
    client.register_gpu_presence(gpu_present)
    gpus_are_present_on_executors = client.await_gpu_check()
    logging.info("TFSparkNode.run await_gpu_check: {0}".format(
        gpus_are_present_on_executors))

    # check again: the GPU check above blocks, so state may have changed
    _check_existing_manager(with_executor=False)

    # start a TFManager and get a free port
    # use a random uuid as the authkey
    authkey = uuid.uuid4().bytes
    if gpus_are_present_on_executors:
        # A 'ps' without a GPU is a valid PS. A 'worker' without a GPU is
        # invalid (all workers should have GPUs) and is set up like a PS.
        # The explicit comparison with False is kept on purpose.
        needs_remote_mgr = (job_name in ('ps', 'worker') and
                            gpu_present == False)  # noqa: E712
    else:
        needs_remote_mgr = job_name == 'ps'

    if needs_remote_mgr:
        # PS nodes must be remotely accessible in order to shutdown from Spark driver.
        TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'],
                                          'remote')
        addr = (host, TFSparkNode.mgr.address[1])
    else:
        # worker nodes only need to be locally accessible within the executor for data feeding
        TFSparkNode.mgr = TFManager.start(authkey, queues)
        addr = TFSparkNode.mgr.address

    # initialize mgr state
    TFSparkNode.mgr.set('state', 'running')
    TFSparkNode.cluster_id = cluster_id

    # expand Hadoop classpath wildcards for JNI (Spark 2.x)
    if 'HADOOP_PREFIX' in os.environ:
        classpath = os.environ['CLASSPATH']
        hadoop_path = os.path.join(os.environ['HADOOP_PREFIX'], 'bin',
                                   'hadoop')
        hadoop_classpath = subprocess.check_output(
            [hadoop_path, 'classpath', '--glob']).decode()
        logging.debug("CLASSPATH: {0}".format(hadoop_classpath))
        os.environ['CLASSPATH'] = classpath + os.pathsep + hadoop_classpath

    # TensorBoard placeholders stored in the node metadata
    tb_pid = 0
    tb_port = 0

    # check server to see if this task is being retried (i.e. already reserved)
    client = reservation.Client(cluster_meta['server_addr'])
    tmp_sock = None
    node_meta = None
    try:
        cluster_info = client.get_reservations()
        for node in cluster_info:
            (nhost, nexec) = (node['host'], node['executor_id'])
            if nhost == host and nexec == executor_id:
                node_meta = node
                port = node['port']

        # if not already done, register everything we need to set up the cluster
        if node_meta is None:
            # first, find a free port for TF
            tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            tmp_sock.bind(('', port))
            port = tmp_sock.getsockname()[1]

            node_meta = {
                'executor_id': executor_id,
                'host': host,
                'job_name': job_name,
                'task_index': task_index,
                'port': port,
                'tb_pid': tb_pid,
                'tb_port': tb_port,
                'addr': addr,
                'authkey': authkey,
                'gpu_present': gpu_present
            }
            # register node metadata with server
            logging.info("TFSparkNode.run register: {0}".format(node_meta))
            client.register(node_meta)
            # wait for other nodes to finish reservations
            cluster_info = client.await_reservations()
            logging.info("TFSparkNode.run await_reservations: {0}".format(
                cluster_info))
    finally:
        client.close()

    # construct a TensorFlow clusterspec from cluster_info
    sorted_cluster_info = sorted(cluster_info,
                                 key=lambda k: k['executor_id'])
    spec = {}
    last_executor_id = -1
    for node in sorted_cluster_info:
        if node['executor_id'] == last_executor_id:
            raise Exception("Duplicate worker/task in cluster_info")
        last_executor_id = node['executor_id']
        logging.info("node: {0}".format(node))
        (njob, nhost, nport) = (node['job_name'], node['host'],
                                node['port'])
        spec.setdefault(njob, []).append("{0}:{1}".format(nhost, nport))

    # Adopt the role recorded for this node in cluster_info, which may
    # differ from the one requested above.
    for node in cluster_info:
        if (node_meta['host'] == node['host'] and
                node_meta['authkey'] == node['authkey']):
            job_name = node['job_name']
            task_index = node['task_index']
            executor_id = node['executor_id']
            break

    # TensorBoard runs on worker 0; when GPUs exist in the cluster that
    # worker must itself have a GPU.
    hosts_tensorboard = (job_name == 'worker' and task_index == 0 and
                         (not gpus_are_present_on_executors or gpu_present))

    hdfs_exec_logdir = ''
    if hosts_tensorboard:
        hdfs_exec_logdir, hdfs_appid_logdir = hdfs.create_directories(
            app_id, run_id, None, 'tensorflowonspark')
        tensorboard.register(hdfs_exec_logdir,
                             hdfs_appid_logdir,
                             0,
                             local_logdir=local_logdir)

    # update TF_CONFIG and reserve GPU for tf.estimator based code
    # Note: this will execute but be ignored by non-tf.estimator code
    tf_config = json.dumps({
        'cluster': spec,
        'task': {
            'type': job_name,
            'index': task_index
        },
        'environment': 'cloud'
    })
    os.environ['TF_CONFIG'] = tf_config

    # create a context object to hold metadata for TF
    ctx = TFNodeContext(executor_id, job_name, task_index, spec,
                        cluster_meta['default_fs'],
                        cluster_meta['working_dir'], TFSparkNode.mgr)

    # release port reserved for TF as late as possible
    if tmp_sock is not None:
        tmp_sock.close()

    # Background mode relies reuse of python worker in Spark.
    if background:
        # However, reuse of python worker can't work on Windows, we need to check if the current
        # script runs on Windows or not.
        if os.name == 'nt' or platform.system() == 'Windows':
            raise Exception("Background mode is not supported on Windows.")
        # Check if the config of reuse python worker is enabled on Spark.
        if not os.environ.get("SPARK_REUSE_WORKER"):
            raise Exception(
                "Background mode relies reuse of python worker on Spark. This config 'spark.python.worker.reuse' is not enabled on Spark. Please enable it before using background."
            )

    def wrapper_fn(args, context):
        """Wrapper function that sets the sys.argv of the executor."""
        if isinstance(args, list):
            sys.argv = args
        fn(args, context)

    def wrapper_fn_background(args, context):
        """Wrapper function that signals exceptions to foreground process."""
        errq = TFSparkNode.mgr.get_queue('error')
        try:
            wrapper_fn(args, context)
        except Exception:
            errq.put(traceback.format_exc())
            errq.join()

    if job_name == 'ps' or background:
        # invoke the TensorFlow main function in a background process
        logging.info(
            "Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process"
            .format(job_name, task_index, job_name, executor_id))
        p = multiprocessing.Process(target=wrapper_fn_background,
                                    args=(tf_args, ctx))
        if job_name == 'ps':
            p.daemon = True
        p.start()

        # for ps nodes only, wait indefinitely in foreground thread for a "control" event (None == "stop")
        if job_name == 'ps':
            queue = TFSparkNode.mgr.get_queue('control')
            equeue = TFSparkNode.mgr.get_queue('error')
            done = False
            while not done:
                while queue.empty() and equeue.empty():
                    time.sleep(1)
                if not equeue.empty():
                    e_str = equeue.get()
                    equeue.task_done()
                    raise Exception("exception in ps:\n" + e_str)
                msg = queue.get(block=True)
                logging.info("Got msg: {0}".format(msg))
                if msg is None:
                    logging.info("Terminating PS")
                    TFSparkNode.mgr.set('state', 'stopped')
                    done = True
                queue.task_done()
    else:
        t = threading.Thread(target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        # otherwise, just run TF function in the main executor/worker thread
        logging.info(
            "Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread"
            .format(job_name, task_index, executor_id))
        try:
            wrapper_fn(tf_args, ctx)
        finally:
            # Nested so that a failure while storing TensorBoard data cannot
            # leave the GPU monitor thread running.
            try:
                if local_logdir and hosts_tensorboard:
                    hopsutil.store_local_tensorboard(
                        tensorboard.local_logdir_path, hdfs_exec_logdir)
            finally:
                if devices.get_num_gpus() > 0:
                    t.do_run = False
                    t.join()

        logging.info(
            "Finished TensorFlow {0}:{1} on cluster node {2}".format(
                job_name, task_index, executor_id))
def _wrapper_fun(iter):
    """Coordinate a Horovod ``mpirun`` across executors.

    Every executor registers its host, working directory and GPU ordinals
    with the coordination server, waits for the cluster description and
    localises the scripts. Only executor 0 creates the experiment directory,
    registers TensorBoard and launches ``mpirun``; all other executors block
    until executor 0 reports that ``mpirun`` has finished.

    The ``mpirun.log`` handle is closed and the GPU monitor thread is
    stopped on every path, including failures.

    Args:
        iter: iterable of executor numbers.

    Returns:
        None

    Raises:
        Exception: on executor 0, if ``mpirun`` exits with a non-zero code.
    """
    for executor_num in iter:
        client = coordination_server.Client(server_addr)

        node_meta = {
            'host': get_ip_address(),
            'executor_cwd': os.getcwd(),
            'cuda_visible_devices_ordinals':
            devices.get_minor_gpu_device_numbers()
        }
        client.register(node_meta)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t_gpus.start()

        try:
            # Only spark executor with index 0 should create necessary HDFS directories and start mpirun
            # Other executors simply block until index 0 reports mpirun is finished
            clusterspec = client.await_reservations()

            gpu_str = ('\n\nChecking for GPUs in the environment\n' +
                       devices.get_gpu_info())
            print(gpu_str)

            mpi_logfile_path = os.getcwd() + '/mpirun.log'
            if os.path.exists(mpi_logfile_path):
                os.remove(mpi_logfile_path)

            # The log file is (re)created on every executor, as before, but
            # the handle is now closed when the block is left.
            with open(mpi_logfile_path, 'w') as mpi_logfile:
                py_runnable = localize_scripts(nb_path, clusterspec)

                # non-chief executor should not do mpirun
                if executor_num != 0:
                    client.await_mpirun_finished()
                    continue

                hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                    app_id, run_id, param_string='Horovod')
                tb_hdfs_path, tb_pid = tensorboard.register(
                    hdfs_exec_logdir, hdfs_appid_logdir, 0)

                mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                          ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                          ' mpirun -np ' + str(get_num_ps(clusterspec)) + ' --hostfile ' + get_hosts_file(clusterspec) + \
                          ' -bind-to none -map-by slot ' + \
                          ' -x LD_LIBRARY_PATH ' + \
                          ' -x HOROVOD_TIMELINE ' + \
                          ' -x TENSORBOARD_LOGDIR ' + \
                          ' -x NCCL_DEBUG=INFO ' + \
                          ' -mca pml ob1 -mca btl ^openib ' + \
                          os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable

                # The command sets environment variables inline, so it is
                # run through the shell.
                mpi = subprocess.Popen(
                    mpi_cmd,
                    shell=True,
                    stdout=mpi_logfile,
                    stderr=mpi_logfile,
                    preexec_fn=util.on_executor_exit('SIGTERM'))

                t_log = threading.Thread(target=print_log)
                t_log.start()

                mpi.wait()
                client.register_mpirun_finished()

            return_code = mpi.returncode

            # Same teardown for success and failure.
            try:
                cleanup(tb_hdfs_path)
            finally:
                t_log.do_run = False
                t_log.join()

            if return_code != 0:
                raise Exception(
                    'mpirun FAILED, look in the logs for the error')
        finally:
            if devices.get_num_gpus() > 0:
                t_gpus.do_run = False
                t_gpus.join()
def _wrapper_fun(iter):
    """
    Spark executor entry point that runs a notebook under a local ``mpirun``.

    Downloads the notebook at ``nb_path``, converts it to a Python script with
    ``jupyter nbconvert``, marks the script executable and launches it with
    ``mpirun -np $MPI_NP``, redirecting mpirun output to ``mpirun.log`` in the
    current working directory.

    Args:
        iter: partition iterator; its last element is used as the executor index

    Returns:
        None

    Raises:
        Exception: if ``mpirun`` exits with a non-zero code
    """
    # Presumably each partition carries exactly one element - TODO confirm.
    for i in iter:
        executor_num = i

    hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
        app_id, run_id, None, 'horovod')

    tb_hdfs_path = ''

    pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
    hopshdfs.init_logger()
    hopshdfs.log('Starting Spark executor with arguments')

    if executor_num == 0:
        tb_hdfs_path, _ = tensorboard.register(
            hdfs_exec_logdir, hdfs_appid_logdir, 0, local_logdir=local_logdir)

    gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info()
    hopshdfs.log(gpu_str)
    print(gpu_str)

    # 1. Download notebook file
    fs_handle = hopshdfs.get_fs()
    try:
        fd = fs_handle.open_file(nb_path, flags='r')
    except Exception:
        # Retry with the ``mode`` keyword when the ``flags`` form is rejected.
        fd = fs_handle.open_file(nb_path, mode='r')
    try:
        notebook = ''.join(fd)
    finally:
        fd.close()

    filename = os.path.basename(nb_path)
    with open(filename, "w+") as f_nb:
        f_nb.write(notebook)

    # 2. Convert notebook to py file
    jupyter_runnable = os.path.abspath(
        os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
    conversion_cmd = jupyter_runnable + ' nbconvert --to python ' + filename
    conversion = subprocess.Popen(conversion_cmd,
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    # communicate() drains both pipes while waiting; calling wait() first can
    # deadlock once the child fills a pipe buffer.
    stdout, stderr = conversion.communicate()
    print(stdout)
    print(stderr)

    # 3. Make py file runnable
    # splitext strips only the final extension, so dotted notebook names work.
    py_runnable = os.getcwd() + '/' + os.path.splitext(filename)[0] + '.py'
    st = os.stat(py_runnable)
    os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

    has_gpus = devices.get_num_gpus() > 0
    t_gpus = threading.Thread(target=devices.print_periodic_gpu_utilization)
    if has_gpus:
        t_gpus.start()

    try:
        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        # 4. Run allreduce
        mpi_np = os.environ['MPI_NP']
        tb_logdir = tensorboard.logdir()
        mpi_cmd = (
            'HOROVOD_TIMELINE=' + tb_logdir + '/timeline.json' +
            ' TENSORBOARD_LOGDIR=' + tb_logdir +
            ' mpirun -np ' + str(mpi_np) +
            ' -bind-to none -map-by slot ' +
            ' -x HOROVOD_TIMELINE ' +
            ' -x TENSORBOARD_LOGDIR ' +
            ' -x NCCL_DEBUG=INFO ' +
            os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable
        )

        # The context manager closes the log file on every exit path.
        with open(mpi_logfile_path, 'w') as mpi_logfile:
            mpi = subprocess.Popen(mpi_cmd,
                                   shell=True,
                                   stdout=mpi_logfile,
                                   stderr=mpi_logfile,
                                   preexec_fn=util.on_executor_exit('SIGTERM'))

            t_log = threading.Thread(target=print_log)
            t_log.start()
            try:
                mpi.wait()
                if local_logdir:
                    local_tb = tensorboard.local_logdir_path
                    pydoop.hdfs.put(local_tb, hdfs_exec_logdir)
            finally:
                # Runs on success and failure alike, exactly once.
                cleanup(tb_hdfs_path)
                t_log.do_run = False
                t_log.join()
    finally:
        if has_gpus:
            t_gpus.do_run = False
            t_gpus.join()

    if mpi.returncode != 0:
        raise Exception('mpirun FAILED, look in the logs for the error')

    # NOTE(review): the logger is only killed on success, as before; confirm
    # whether it should also be killed when mpirun fails.
    hopshdfs.kill_logger()
def _wrapper_fun(iter):
    """
    Spark executor entry point that runs ``map_fun`` once with TensorBoard
    registered and, when GPUs are present, a GPU utilization monitor thread.

    Args:
        iter: partition iterator; its last element is used as the executor number

    Returns:
        None

    """
    # Presumably each partition carries exactly one element - TODO confirm.
    for i in iter:
        executor_num = i

    tb_hdfs_path = ''
    hdfs_exec_logdir = ''

    has_gpus = devices.get_num_gpus() > 0
    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if has_gpus:
        t.start()

    try:
        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs._create_directories(
            app_id, run_id, None, 'mirrored')
        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
        hopshdfs._init_logger()
        tb_hdfs_path, _ = tensorboard._register(
            hdfs_exec_logdir, hdfs_appid_logdir, executor_num, local_logdir=local_logdir)

        gpu_str = '\nChecking for GPUs in the environment' + devices._get_gpu_info()
        hopshdfs.log(gpu_str)
        print(gpu_str)
        print('-------------------------------------------------------')
        print('Started running task\n')
        hopshdfs.log('Started running task')

        task_start = datetime.datetime.now()
        retval = map_fun()
        task_end = datetime.datetime.now()

        # NOTE(review): falsy results (0, 0.0, '', {}) are skipped here, not
        # just None - confirm that a metric of 0 should really be dropped.
        if retval:
            _handle_return(retval, hdfs_exec_logdir)

        time_str = 'Finished task - took ' + util._time_diff(task_start, task_end)
        print('\n' + time_str)
        print('-------------------------------------------------------')
        hopshdfs.log(time_str)
    finally:
        # Single cleanup path for success and failure: the exception, if any,
        # propagates after this block, and cleanup runs only once, after the
        # local TensorBoard logs have been stored.
        try:
            if local_logdir:
                local_tb = tensorboard.local_logdir_path
                util._store_local_tensorboard(local_tb, hdfs_exec_logdir)
        except Exception as store_error:
            # Storing the logs is best effort; report the failure and carry on.
            print('Failed to store local TensorBoard logs: ' + str(store_error))
        _cleanup(tb_hdfs_path)
        if has_gpus:
            t.do_run = False
            t.join()
def _wrapper_fun(iter):
    """
    Spark executor entry point for distributed training with parameter servers.

    Registers this executor with the reservation server as ``ps`` or
    ``worker``, waits for the full cluster, exports ``TF_CONFIG`` and runs
    ``map_fun``. A ``ps`` executor runs ``map_fun`` in a background thread and
    blocks until all workers have finished; every other role runs it inline.

    Args:
        iter: partition iterator; its last element is used as the executor number

    Returns:
        None

    """
    # Presumably each partition carries exactly one element - TODO confirm.
    for i in iter:
        executor_num = i

    experiment_utils._set_ml_id(app_id, run_id)

    has_gpus = devices.get_num_gpus() > 0
    t = threading.Thread(target=devices._print_periodic_gpu_utilization)
    if has_gpus:
        t.start()

    role = None

    client = parameter_server_reservation.Client(server_addr)

    try:
        host = experiment_utils._get_ip_address()

        # Bind to port 0 to get a free port, and keep the socket open until
        # the cluster is reserved so the port stays taken.
        tmp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            tmp_socket.bind(('', 0))
            port = tmp_socket.getsockname()[1]
            host_port = host + ":" + str(port)

            exec_spec = {
                "task_type": "ps" if executor_num < num_ps else "worker",
                "host_port": host_port,
                "gpus_present": has_gpus,
            }
            client.register(exec_spec)

            cluster = client.await_reservations()
        finally:
            # Close the socket even if registration or reservation fails.
            tmp_socket.close()

        role, index = experiment_utils._find_task_and_index(host_port, cluster)

        cluster_spec = {
            "cluster": cluster,
            "task": {"type": role, "index": index},
        }

        if evaluator:
            # The last worker is moved out of the worker list to be evaluator.
            evaluator_node = cluster_spec["cluster"]["worker"].pop()
            cluster_spec["cluster"]["evaluator"] = [evaluator_node]
            if evaluator_node == host_port:
                role = "evaluator"
                cluster_spec["task"] = {"type": "evaluator", "index": 0}

        print('TF_CONFIG: {} '.format(cluster_spec))
        os.environ["TF_CONFIG"] = json.dumps(cluster_spec)

        experiment_logdir = experiment_utils._get_logdir(app_id, run_id)
        logfile = experiment_utils._init_logger(
            experiment_logdir, role=role, index=cluster_spec["task"]["index"])

        dist_logdir = experiment_logdir + '/logdir'

        # The task type lives in cluster_spec["task"] and is mirrored by
        # `role`. `cluster` is used above as the role -> hosts mapping, so it
        # presumably has no "task" entry - verify against the reservation server.
        is_chief = (role == "chief")
        if is_chief:
            hdfs.mkdir(dist_logdir)
            tensorboard._register(dist_logdir, experiment_logdir, executor_num,
                                  local_logdir=local_logdir)
        else:
            tensorboard.events_logdir = dist_logdir

        print(devices._get_gpu_info())
        print('-------------------------------------------------------')
        print('Started running task')
        task_start = time.time()

        retval = None
        if role == "ps":
            ps_thread = threading.Thread(target=map_fun)
            ps_thread.start()
            client.await_all_workers_finished()
        else:
            retval = map_fun()

        if is_chief:
            experiment_utils._handle_return_simple(retval, experiment_logdir, logfile)

        task_end = time.time()
        time_str = 'Finished task - took ' + experiment_utils._time_diff(task_start, task_end)
        print(time_str)
        print('-------------------------------------------------------')
    finally:
        try:
            # Executors without the "ps" role, including ones that failed
            # before a role was assigned, report that they are finished.
            if role != "ps":
                client.register_worker_finished()
            client.close()
        finally:
            # Clean up even if notifying the reservation server fails.
            experiment_utils._cleanup(tensorboard, t)