def dml_exec(function_name, data, env='auto', verbose=True, **kwargs):
    """
    Executes a distributed ml function

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exist in the unity distributed shared library.
    data : dict
        Key value arguments to the function stored in a dictionary
    env : DMLEnvironment
        Contains job environment parameters and a job submit function.
    verbose : bool
        If True, stream commander/worker logs to stdout and local temp files
        while the job runs.
    **kwargs : dict
        Additional options. See _get_worker_args and _get_commander_args.
        - check_hdfs : {0, 1} Perform sanity check for hdfs read and write
        - startup_timeout : int Timeout in seconds for cluster setup

    Returns
    -------
    (success, message, result) : bool, str, object
        On success, the third element is the toolkit's deserialized 'result'
        value; on failure it is None and `message` describes the error.
    """
    from graphlab.extensions import dml_function_invocation, init_dml_class_registry
    init_dml_class_registry()

    if env == 'auto':
        env = DMLRemoteEnvironment()

    # Make sure the job working directory exists before anything is written.
    if not file_util.exists(env.working_dir):
        _log.debug('Creating working directory: %s' % env.working_dir)
        file_util.mkdir(env.working_dir)
    else:
        _log.debug('Using existing working directory: %s' % env.working_dir)

    _log.info('Running distributed execution with %d workers. Working directory: %s' % (env.num_workers, env.working_dir))

    success = False
    message = ""
    result_path = None

    # Serialize the job function arguments into the working directory.
    try:
        _log.info('Serializing arguments to %s' % env.working_dir)
        args = dml_function_invocation()
        # Shallow-copy so the caller's dict is not mutated by __base_path__.
        data_copy = copy(data)
        internal_working_dir = _make_internal_url(env.working_dir)
        data_copy['__base_path__'] = internal_working_dir
        args.from_dict(data_copy, internal_working_dir)
        json_data = args.to_str()

        # Sanitize the base path url so credentials embedded in s3 urls
        # do not leak into the log.
        sanitized_json_data = json_data
        if file_util.is_s3_path(json_data):
            sanitized_json_data = _sanitize_internal_s3_url(json_data)
        _log.info('Serialized arguments: %s' % sanitized_json_data)
    except Exception as e:
        success = False
        message = 'Error serializing arguments. %s' % str(e)
        return (success, message, None)

    # Submit the job to the cluster.
    try:
        job = dml_submit(function_name, json_data, env,
                         metric_server_address_file=COMMANDER_LOG_SERVER_ADDRESS_FILE,
                         logprogress_file=PROGRESS_LOG_FILE,
                         **kwargs)
    except KeyboardInterrupt:
        message = 'Canceled by user'
        return (success, message, None)

    _log.info('Waiting for workers to start ... ')
    logprinter = None
    fd_list = []
    if verbose:
        # Discover the commander's log/metric server so we can stream
        # remote logs locally while the job runs.
        log_server_address_path = os.path.join(env.working_dir,
                                               COMMANDER_LOG_SERVER_ADDRESS_FILE)
        log_server_address = get_log_metric_server_address(
            log_server_address_path,
            timeout=INIT_TIMEOUT_PER_WORKER * env.num_workers)
        if len(log_server_address) > 0:
            tmp_log_dir = tempfile.mkdtemp(prefix='graphlab_dml_log_')
            logprinter = LogPrinter()
            # Attach log progress stream
            logprinter.add_stream(LogStream(log_server_address + '/progress',
                                            os.path.join(env.working_dir, PROGRESS_LOG_FILE),
                                            sys.stdout))
            # Attach commander log stream
            local_commander_log = open(os.path.join(tmp_log_dir, COMMANDER_LOG_FILE), 'w')
            fd_list.append(local_commander_log)
            logprinter.add_stream(LogStream(log_server_address + '/commander',
                                            os.path.join(env.working_dir, COMMANDER_LOG_FILE),
                                            local_commander_log))
            # Attach worker log streams
            for i in range(env.num_workers):
                local_worker_log = open(os.path.join(tmp_log_dir, WORKER_LOG_FILE(i)), 'w')
                fd_list.append(local_worker_log)
                logprinter.add_stream(LogStream(log_server_address + '/worker%d' % i,
                                                os.path.join(env.working_dir, WORKER_LOG_FILE(i)),
                                                local_worker_log))
            logprinter.start()
            _log.info('Success. Worker logs are available at %s ' % tmp_log_dir)

    # Ensure log streaming is stopped and local log files are closed even if
    # waiting for the job raises (e.g. KeyboardInterrupt); previously these
    # file descriptors leaked on an exception.
    try:
        _log.debug('Wait for job to finish')
        (success, message) = _wait_and_parse_job_result(job)
    finally:
        if logprinter:
            logprinter.stop()
            for fd in fd_list:
                fd.close()

    if success:
        try:
            result_path = os.path.join(env.working_dir, env.output_name)
            ret_str = file_util.read(result_path)
            sanitized_ret_str = _sanitize_internal_s3_url(ret_str)
            _log.debug('Deserializing results: %s' % sanitized_ret_str)
            args.from_str(ret_str)
            response = args.to_dict()
            # Check toolkit response for "result" key or "exception" key.
            if 'result' in response:
                return (success, message, response['result'])
            elif 'exception' in response:
                return (False, response['exception'], None)
            else:
                raise ValueError('Invalid toolkit response. Must have "result" or "exception" as key')
        except Exception as e:
            success = False
            message = 'Error deserializing results. %s' % str(e)
            return (success, message, None)
    else:
        return (success, message, None)
def dml_submit(function_name, str_data, env, **kwargs):
    """
    Submit a distributed ml function for execution.

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exist in the unity distributed shared library.
    str_data : str
        Arguments as serialized string to be passed to the distributed
        function.
    env : DMLEnvironment
        Contains job environment parameters and a job submit function.
    **kwargs : dict
        Additional options. See _get_worker_args and _get_commander_args.
        - check_hdfs : {0, 1} Perform sanity check for hdfs read and write
        - startup_timeout : int Timeout in seconds for cluster setup

    Returns
    -------
    job : map_job
    """
    _log.debug('Submitting job')
    if not file_util.exists(env.working_dir):
        file_util.mkdir(env.working_dir)

    # map_job_args[0] is the commander's arguments; map_job_args[1:] are the
    # per-worker arguments.
    map_job_args = _get_dml_exec_args(function_name, str_data, env,
                                      output_name=env.output_name,
                                      **kwargs)
    _log.debug('job arguments: %s' % str(map_job_args))

    # The following achieves the same as
    #   """return env.submit(subprocess_exec, map_job_args)"""
    # but needs one container fewer: having the commander occupy an entire
    # container by itself is wasteful, so we use group_exec and pack the
    # commander function together with the first worker function into a
    # single map task. The remaining workers are unchanged. Because
    # group_exec yields a list per group, the job output is a nested list;
    # job.get_results is overloaded below to flatten it.
    def _make_task(idx):
        # Bind idx via a parameter now, so every closure sees its own index
        # rather than the loop variable's final value.
        return lambda: subprocess_exe(**map_job_args[idx])

    # One group per worker; worker w runs map_job_args[w + 1].
    function_groups = [[_make_task(w + 1)] for w in range(env.num_workers)]
    # Prepend the commander task (map_job_args[0]) to the first group.
    function_groups[0].insert(0, _make_task(0))

    job = env.submit(group_exec,
                     [{'lambdas': fgroup} for fgroup in function_groups])

    # Replace job.get_results with a wrapper that flattens the nested
    # per-group result lists into a single flat list.
    original_get_results = job.get_results

    def _get_flat_results():
        nested = original_get_results()
        return [item for group in nested for item in group]

    job.get_results = _get_flat_results
    return job