Example 1
def dml_exec(function_name, data, env='auto', verbose=True, **kwargs):
    """
    Executes a distributed ML function.

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exist in the unity distributed shared library.

    data : dict
        Key/value arguments to the function, stored in a dictionary.

    env : DMLEnvironment, optional
        Contains job environment parameters and a job submit function. The
        default 'auto' creates a DMLRemoteEnvironment.

    verbose : bool, optional
        If True, stream the commander and worker logs to stdout while the job
        runs.

    **kwargs : dict
        Additional options.
        See _get_worker_args and _get_commander_args.
            - check_hdfs : {0, 1} Perform sanity check for hdfs read and write
            - startup_timeout : int Timeout in seconds for cluster setup

    Returns
    -------
    (success, message, result_path) : (bool, str, str)
    """
    from graphlab.extensions import dml_function_invocation, init_dml_class_registry
    init_dml_class_registry()

    if env == 'auto':
        env = DMLRemoteEnvironment()

    if not file_util.exists(env.working_dir):
        _log.debug('Creating working directory: %s' % env.working_dir)
        file_util.mkdir(env.working_dir)
    else:
        _log.debug('Using existing working directory: %s' % env.working_dir)

    _log.info('Running distributed execution with %d workers. Working directory: %s' % (env.num_workers, env.working_dir))

    success = False
    message = ""
    result_path = None

    # Job function arguments
    try:
        _log.info('Serializing arguments to %s' % env.working_dir)
        args = dml_function_invocation()
        data_copy = copy(data)
        internal_working_dir = _make_internal_url(env.working_dir)
        data_copy['__base_path__'] = internal_working_dir
        args.from_dict(data_copy, internal_working_dir)
        json_data = args.to_str()

        # Sanitize the base path URL before logging it
        sanitized_json_data = json_data
        if file_util.is_s3_path(json_data):
            sanitized_json_data = _sanitize_internal_s3_url(json_data)

        _log.info('Serialized arguments: %s' % sanitized_json_data)
    except Exception as e:
        success = False
        message = 'Error serializing arguments. %s' % str(e)
        return (success, message, None)

    # Submit job
    try:
        job = dml_submit(function_name, json_data, env,
                         metric_server_address_file=COMMANDER_LOG_SERVER_ADDRESS_FILE,
                         logprogress_file=PROGRESS_LOG_FILE,
                         **kwargs)
    except KeyboardInterrupt:
        message = 'Canceled by user'
        return (success, message, None)

    _log.info('Waiting for workers to start ... ')
    logprinter = None
    if verbose:
        log_server_address_path = os.path.join(env.working_dir,
                                               COMMANDER_LOG_SERVER_ADDRESS_FILE)
        log_server_address = get_log_metric_server_address(log_server_address_path,
                                                           timeout=INIT_TIMEOUT_PER_WORKER * env.num_workers)
        if len(log_server_address) > 0:
            tmp_log_dir = tempfile.mkdtemp(prefix='graphlab_dml_log_')
            fd_list = []
            logprinter = LogPrinter()
            # Attach log progress stream
            logprinter.add_stream(LogStream(log_server_address + '/progress',
                                            os.path.join(env.working_dir, PROGRESS_LOG_FILE),
                                            sys.stdout))
            # Attach commander log stream
            local_commander_log = open(os.path.join(tmp_log_dir, COMMANDER_LOG_FILE), 'w')
            fd_list.append(local_commander_log)
            logprinter.add_stream(LogStream(log_server_address + '/commander',
                                            os.path.join(env.working_dir, COMMANDER_LOG_FILE),
                                            local_commander_log))
            # Attach worker log streams
            for i in range(env.num_workers):
                local_worker_log = open(os.path.join(tmp_log_dir, WORKER_LOG_FILE(i)), 'w')
                fd_list.append(local_worker_log)
                logprinter.add_stream(LogStream(log_server_address + '/worker%d' % i,
                                                os.path.join(env.working_dir, WORKER_LOG_FILE(i)),
                                                local_worker_log))
            logprinter.start()
            _log.info('Success. Worker logs are available at %s' % tmp_log_dir)

    _log.debug('Wait for job to finish')
    (success, message) = _wait_and_parse_job_result(job)

    if logprinter:
        logprinter.stop()
        for fd in fd_list:
            fd.close()

    if success:
        try:
            result_path = os.path.join(env.working_dir, env.output_name)
            ret_str = file_util.read(result_path)
            sanitized_ret_str = _sanitize_internal_s3_url(ret_str)
            _log.debug('Deserializing results: %s' % sanitized_ret_str)

            args.from_str(ret_str)
            response = args.to_dict()

            # Check toolkit response for "result" key or "exception" key.
            if 'result' in response:
                return (success, message, response['result'])
            elif 'exception' in response:
                return (False, response['exception'], None)
            else:
                raise ValueError('Invalid toolkit response. Must have '
                                 '"result" or "exception" as key')
        except Exception as e:
            success = False
            message = 'Error deserializing results. %s' % str(e)
            return (success, message, None)
    else:
        return (success, message, None)
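
A minimal calling sketch for dml_exec, for context; the distributed function name and the data keys below are illustrative placeholders, not names defined by the library:

# Hypothetical usage of dml_exec. Only env='auto' and the kwargs documented
# above (check_hdfs, startup_timeout) come from the example itself; the
# function name and data dictionary are made up for illustration.
success, message, result_path = dml_exec(
    'example_distributed_fn',   # assumed to exist in the unity shared library
    {'num_iterations': 10},     # key/value arguments for the function
    env='auto',                 # dml_exec builds a DMLRemoteEnvironment
    verbose=True,               # stream commander/worker logs to stdout
    check_hdfs=1,               # forwarded to dml_submit via **kwargs
    startup_timeout=600)
if not success:
    raise RuntimeError(message)
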
Example 2
def dml_submit(function_name, str_data, env, **kwargs):
    """
    Submits a distributed ML function for execution and returns the job.

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exist in the unity distributed shared library.

    str_data : str
        Arguments, serialized as a string, to be passed to the distributed
        function.

    env : DMLEnvironment
        Contains job environment parameters and a job submit function.

    **kwargs : dict
        Additional options. See _get_worker_args and _get_commander_args.
        - check_hdfs : {0, 1} Perform sanity check for hdfs read and write
        - startup_timeout : int Timeout in seconds for cluster setup

    Returns
    -------
    job : map_job
    """
    _log.debug('Submitting job')

    if not file_util.exists(env.working_dir):
        file_util.mkdir(env.working_dir)

    map_job_args = _get_dml_exec_args(function_name, str_data, env,
                                      output_name=env.output_name,
                                      **kwargs)

    _log.debug('job arguments: %s' % str(map_job_args))

    # The following code achieves the same as
    # """return env.submit(subprocess_exe, map_job_args)"""
    # but requires one less container. (Having the commander code take an
    # entire container is wasteful.)

    # It uses group_exec and packs the commander function and the first worker
    # function into a single map task; the remaining workers stay the same.
    # group_exec returns one result list per group, so the output is a nested
    # list of results; we overload the job.get_results function to flatten it.

    def commander_exec():
        return lambda: subprocess_exe(**map_job_args[0])

    # worker_exec is a factory so each returned lambda binds its own index i
    # rather than sharing the loop variable.
    def worker_exec(i):
        return lambda: subprocess_exe(**map_job_args[i + 1])

    worker_to_function_group = [[worker_exec(i)] for i in range(env.num_workers)]
    worker_to_function_group[0].insert(0, commander_exec())
    job = env.submit(group_exec, [{'lambdas': fgroup} for fgroup in worker_to_function_group])

    # Decorate the job get_results function to flatten the results
    def flatten_results(packed_results):
        return [item for sublist in packed_results for item in sublist]

    def decorate_with_flatten_results(f_original):
        def f_decorated():
            results = f_original()
            return flatten_results(results)
        return f_decorated
    job.get_results = decorate_with_flatten_results(job.get_results)
    return job
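
The get_results decoration at the end of dml_submit can be illustrated on its own; the sketch below has no GraphLab dependencies and uses a stand-in for job.get_results:

# Standalone sketch of the flattening decoration above. group_exec returns one
# result list per function group (the commander and worker 0 share a group),
# and the decorator turns the nested lists into a single flat list.
def flatten_results(packed_results):
    return [item for sublist in packed_results for item in sublist]

def decorate_with_flatten_results(f_original):
    def f_decorated():
        return flatten_results(f_original())
    return f_decorated

# Stand-in for job.get_results returning grouped (nested) results.
fake_get_results = lambda: [['commander', 'worker0'], ['worker1'], ['worker2']]
get_results = decorate_with_flatten_results(fake_get_results)
assert get_results() == ['commander', 'worker0', 'worker1', 'worker2']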