Code example #1
    def _get_progress_from_file(self):
        log_location = '/'.join([self._working_dir, 'progress.log'])
        if _file_util.exists(log_location):
            content = _file_util.read(log_location)
            if content:
                # Strip the trailing '$' sentinel that marks a complete write.
                return content[:-1]
        else:
            __LOGGER__.warning('Progress log file cannot be found')
        return ""
Code example #2
    def get_results(self):
        """
        Retrieve the result from this job.

        This is a BLOCKING function. It will block until the job is completed
        and a usable result can be returned (usually a trained model). Raises
        an RuntimeError if job fails for any reason, or if the job succeeds but
        the result cannot be returned.

        Returns
        -------
        out : many types
        """
        if self._result is not None:
            return self._result

        (retcode, output) = self._job_handle.wait()
        final_state_ready = False
        while not final_state_ready:
            try:
                state = self.get_final_state()
                final_state_ready = True
            except RuntimeError:
                # Final state not yet published; retry after a short wait.
                time.sleep(1)

        if state == JobHandle.FINAL_STATE.SUCCESS:
            result_location = '/'.join([self._working_dir, 'out'])
            ret_str = _file_util.read(result_location)
            if ret_str is None:
                raise RuntimeError("Failed to read from expected result location: " + str(result_location))
            self._dml_serdes.from_str(ret_str)
            result_dict = self._dml_serdes.to_dict()
            if 'exception' in result_dict:
                __LOGGER__.debug("Found exception in DML result. Result: " + str(result_dict))
                raise ToolkitError(result_dict['exception'])
            if 'result' not in result_dict:
                __LOGGER__.debug("Model not found in DML result. Result: " + str(result_dict))
                raise RuntimeError("Model not found, though job completed successfully!")
            dml_result = result_dict['result']
            result_ctor = _supported_result_objects[self._algo_name]

            self._result = result_ctor(dml_result)
            return self._result
        else:
            self._failed_client_output = output
            __LOGGER__.debug("Client failed. Return code: " + str(retcode) +\
                    "\nSTDOUT:\n" + output[0] + \
                    "\nSTDERR:\n" + output[1])
            __LOGGER__.debug("Current state: " + str(self.get_state()))
            raise RuntimeError("Job failed with final state " + str(state) +\
                    ". Use the status_summary method to diagnose the issue.")
Code example #3
def _dml_read_app_metric_server(working_dir):
    # urlopen moved from urllib2 to urllib.request in Python 3.
    if sys.version_info.major == 2:
        from urllib2 import urlopen
    else:
        from urllib.request import urlopen

    metric_server_address = os.path.join(working_dir, 'metric_server_address')
    url = file_util.read(metric_server_address)
    # A trailing '$' sentinel marks the address file as completely written.
    if not url or url[-1] != '$':
        return ''
    else:
        url = url[:-1] + '/progress'
        logger.info('Opening URL %s' % url)
        return urlopen(url).read()
Code example #4
def get_log_metric_server_address(log_server_address_file, timeout=120):
    starttime = time.time()
    try:
        # Wait for the address file to appear, up to `timeout` seconds.
        while not file_util.exists(log_server_address_file):
            time.sleep(0.05)
            if (time.time() - starttime) > timeout:
                __logger__.warning('Unable to get server log (timeout reached)')
                return ""
        ret_str = file_util.read(log_server_address_file)
        # A trailing '$' sentinel marks the file as completely written.
        if ret_str.endswith('$'):
            return ret_str[:-1]
    except Exception as e:
        __logger__.warning(e)
    return ""
Code example #5
    def receive_from_file(self):
        try:
            if file_util.exists(self.file_url):
                __logger__.debug("Read from %s" % self.file_url)
                content = file_util.read(self.file_url)
                # Skip everything that has already been forwarded.
                leftover_progress_content = content[len(self.total_received_message):]
                # The latest write is incomplete until the '$' sentinel appears.
                if not leftover_progress_content.endswith('$'):
                    return False

                if leftover_progress_content:
                    self.out.write(leftover_progress_content[:-1])  # ignore $
                    self.out.flush()
                self.total_received_message += leftover_progress_content
                return True
        except Exception as e:
            __logger__.warning(e)
        return False
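
`receive_from_file` tails the log by re-reading the whole file and slicing off what was already forwarded. The incremental-read step in isolation, as a self-contained sketch with plain `open` standing in for `file_util` (illustrative only):

def read_new_content(path, already_received):
    # Returns (new_total, fresh_text), or (already_received, None) while
    # the latest write has not yet been terminated by the '$' sentinel.
    with open(path) as f:
        content = f.read()
    leftover = content[len(already_received):]
    if not leftover.endswith('$'):
        return already_received, None
    return content, leftover[:-1]  # drop the sentinel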
Code example #6
    def _get_job_log_server_address(self, timeout=10):
        if self._log_server_address:
            return self._log_server_address
        log_server_address_file = '/'.join([self._working_dir, 'metric_server_address'])
        starttime = time.time()
        # Use a separate flag so we do not shadow the `timeout` parameter.
        timed_out = False
        __LOGGER__.info('Waiting for log server address to be available')
        while not _file_util.exists(log_server_address_file):
            time.sleep(1)
            if (time.time() - starttime) > timeout:
                __LOGGER__.info('Timeout waiting for log server address')
                timed_out = True
                break
        if not timed_out:
            ret_str = _file_util.read(log_server_address_file)
            # A trailing '$' sentinel marks the address file as complete.
            if ret_str.endswith('$'):
                self._log_server_address = ret_str[:-1] + "/progress"
                __LOGGER__.info('Log server address: %s' % self._log_server_address)
                return self._log_server_address
        return ""
Code example #7
def _dml_read_app_progress_file(working_dir):
    # Return the raw contents of the progress log (None if it cannot be read).
    progress_file = os.path.join(working_dir, 'progress.log')
    return file_util.read(progress_file)
Code example #8
def dml_exec(function_name, data, env='auto', verbose=True, **kwargs):
    """
    Executes a distributed ml function

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exists in the unity distributed shared library.

    data : dict
        Key value arguments to the function stored in a dictionary

    env : DMLEnvironemnt
        Contains job environment parameters and a job submit function.

    **kwargs : dict
        Additional options.
        See _get_worker_args and _get_commander_args.
            - check_hdfs : {0, 1} Perform sanity check for hdfs read and write
            - startup_timeout : int Timeout in seconds for cluster setup

    Return
    ------
    (success, message, result_path) : bool, str, str
    """
    from graphlab.extensions import dml_function_invocation, init_dml_class_registry
    init_dml_class_registry()

    if env == 'auto':
        env = DMLRemoteEnvironment()

    if not file_util.exists(env.working_dir):
        _log.debug('Creating working directory: %s' % env.working_dir)
        file_util.mkdir(env.working_dir)
    else:
        _log.debug('Using existing working directory: %s' % env.working_dir)

    _log.info('Running distributed execution with %d workers. Working directory: %s' % (env.num_workers, env.working_dir))

    success = False
    message = ""
    result_path = None

    # Job function arguments
    try:
        _log.info('Serializing arguments to %s' % env.working_dir)
        args = dml_function_invocation()
        data_copy = copy(data)
        internal_working_dir = _make_internal_url(env.working_dir)
        data_copy['__base_path__'] = internal_working_dir
        args.from_dict(data_copy, internal_working_dir)
        json_data = args.to_str()

        # sanitize the base path url
        sanitized_json_data = json_data
        if file_util.is_s3_path(json_data):
            sanitized_json_data = _sanitize_internal_s3_url(json_data)

        _log.info('Serialized arguments: %s' % sanitized_json_data)
    except Exception as e:
        success = False
        message = 'Error serializing arguments. %s' % str(e)
        return (success, message, None)

    # Submit job
    try:
        job = dml_submit(function_name, json_data, env,
                         metric_server_address_file=COMMANDER_LOG_SERVER_ADDRESS_FILE,
                         logprogress_file=PROGRESS_LOG_FILE,
                         **kwargs)
    except KeyboardInterrupt:
        message = 'Canceled by user'
        return (success, message, None)

    _log.info('Waiting for workers to start ... ')
    logprinter = None
    if verbose:
        log_server_address_path = os.path.join(env.working_dir,
                                               COMMANDER_LOG_SERVER_ADDRESS_FILE)
        log_server_address = get_log_metric_server_address(log_server_address_path,
                                                           timeout=INIT_TIMEOUT_PER_WORKER * env.num_workers)
        if len(log_server_address) > 0:
            tmp_log_dir = tempfile.mkdtemp(prefix='graphlab_dml_log_')
            fd_list = []
            logprinter = LogPrinter()
            # Attach log progress stream
            logprinter.add_stream(LogStream(log_server_address + '/progress',
                                            os.path.join(env.working_dir, PROGRESS_LOG_FILE),
                                            sys.stdout))
            # Attach commander log stream
            local_commander_log = open(os.path.join(tmp_log_dir, COMMANDER_LOG_FILE), 'w')
            fd_list.append(local_commander_log)
            logprinter.add_stream(LogStream(log_server_address + '/commander',
                                            os.path.join(env.working_dir, COMMANDER_LOG_FILE),
                                            local_commander_log))
            # Attach worker log streams
            for i in range(env.num_workers):
                local_worker_log = open(os.path.join(tmp_log_dir, WORKER_LOG_FILE(i)), 'w')
                fd_list.append(local_worker_log)
                logprinter.add_stream(LogStream(log_server_address + '/worker%d' % i,
                                                os.path.join(env.working_dir, WORKER_LOG_FILE(i)),
                                                local_worker_log))
            logprinter.start()
            _log.info('Success. Worker logs are available at %s' % tmp_log_dir)

    _log.debug('Wait for job to finish')
    (success, message) = _wait_and_parse_job_result(job)

    if logprinter:
        logprinter.stop()
        for fd in fd_list:
            fd.close()

    if success:
        try:
            result_path = os.path.join(env.working_dir, env.output_name)
            ret_str = file_util.read(result_path)
            sanitized_ret_str = _sanitize_internal_s3_url(ret_str)
            _log.debug('Deserializing results: %s' % sanitized_ret_str)

            args.from_str(ret_str)
            response = args.to_dict()

            # Check toolkit response for "result" key or "exception" key.
            if 'result' in response:
                return (success, message, response['result'])
            elif 'exception' in response:
                return (False, response['exception'], None)
            else:
                raise ValueError('Invalid toolkit response. Must have '
                                 '"result" or "exception" as key')
        except Exception as e:
            success = False
            message = 'Error deserializing results. %s' % str(e)
            return (success, message, None)
    else:
        return (success, message, None)
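
A hypothetical invocation, going by the signature and the documented return tuple (the function name and argument keys below are placeholders, not taken from the source):

success, message, result = dml_exec('my_distributed_fn',
                                    {'training_data': 'hdfs:///path/to/data'},
                                    env='auto',
                                    verbose=True,
                                    startup_timeout=300)
if not success:
    raise RuntimeError('Distributed job failed: %s' % message)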