    def save(self, path, aws_credentials={}):
        """ Save the predictive object to the given path.

        Parameters
        ----------
        path : str
          The location to save the predictive object to; may be a local,
          S3, or HDFS path.
        aws_credentials : dict, optional
          AWS credentials used when saving to an S3 path.
        """
        # only support saving to local, S3, or HDFS paths for now
        if not (fu.is_s3_path(path) or
                fu.is_local_path(path) or
                fu.is_hdfs_path(path)):
            raise RuntimeError("Only saving to a local, S3, or HDFS path is supported; "
                               "cannot save predictive object to path %s." % path)

        if fu.is_local_path(path) and os.path.exists(path):
            logging.warning("Overwriting existing file '%s' when saving predictive object" % path)
            rm_fn = os.remove if os.path.isfile(path) else shutil.rmtree
            rm_fn(path)

        if fu.is_local_path(path):
            self._save_local(path)
        else:
            self._save_remote(path, aws_credentials)

        tracker = _mt._get_metric_tracker()
        tracker.track('deploy.predictive_service.predictive_object',
            value=1,
            properties={
                'type': self.__class__.__name__
            }
        )
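A brief usage sketch of the save path above. The class name `MyModelWrapper` and the credential values are hypothetical placeholders; only the local vs. remote dispatch mirrors the code.

# Hypothetical usage of save(): local paths overwrite any existing file or
# directory, remote paths go through _save_remote with the given credentials.
po = MyModelWrapper()
po.save('/tmp/my_predictive_object')
po.save('s3://my-bucket/models/my_predictive_object',
        aws_credentials={'aws_access_key_id': 'AKIA...',
                         'aws_secret_access_key': '...'})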
def _copy_predictive_object_files(source_path, target_path, is_dir, src_credentials, tgt_credentials):
    '''
    Copy either file or folder from source location to target location
    '''
    # Clean up the target path if it already exists locally
    if _file_util.is_local_path(target_path) and _os.path.exists(target_path):
        if _os.path.isfile(target_path):
            _os.remove(target_path)
        else:
            _shutil.rmtree(target_path)

    if _file_util.is_s3_path(source_path) and _file_util.is_s3_path(target_path):

        # compare credentials
        _check_aws_credentials(src_credentials, tgt_credentials, source_path)

        # intra s3 copy model
        _file_util.intra_s3_copy_model(source_path, target_path, is_dir, tgt_credentials)
    elif _file_util.is_local_path(source_path):

        _file_util.copy_from_local(source_path, target_path, is_dir = is_dir)

    else:
        tmp_dir = _tempfile.mkdtemp(prefix = 'copy_predictive_object')
        try:
            # download to local first
            local_path = _os.path.join(tmp_dir, 'temp_po_file')
            if _file_util.is_s3_path(source_path):
                _file_util.download_from_s3(
                    source_path,
                    local_path,
                    is_dir=is_dir,
                    aws_credentials=src_credentials,
                    silent=False)
            elif _file_util.is_hdfs_path(source_path):
                _file_util.download_from_hdfs(source_path, local_path, is_dir=is_dir)
            else:
                raise RuntimeError('Unsupported file system type: %s' % source_path)

            # upload from local to remote
            if _file_util.is_s3_path(target_path):
                _file_util.upload_to_s3(local_path, target_path, is_dir=is_dir,
                    aws_credentials=tgt_credentials, silent=False)
            elif _file_util.is_hdfs_path(target_path):
                _file_util.hdfs_mkdir(target_path)
                _file_util.upload_to_hdfs(local_path, target_path, force=True, silent=False)
            else:
                _file_util.upload_to_local(local_path, target_path, is_dir=is_dir, silent=False)

        finally:
            _shutil.rmtree(tmp_dir)
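`_check_aws_credentials` is referenced above but not included in this listing. A minimal sketch under the assumption that an intra-S3 copy simply requires the source and target credentials to match; the real helper may perform a different check.

def _check_aws_credentials(src_credentials, tgt_credentials, source_path):
    # Hypothetical sketch: the intra-S3 copy above is issued with the target
    # credentials, so they must also grant access to the source bucket. Here
    # we just require both credential sets to use the same access key.
    src_credentials = src_credentials or {}
    tgt_credentials = tgt_credentials or {}
    if src_credentials.get('aws_access_key_id') != tgt_credentials.get('aws_access_key_id'):
        raise RuntimeError('Intra-S3 copy of %s requires the same AWS '
                           'credentials for source and target.' % source_path)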
    def _finalize(self):
        '''
        When the job finishes, query the metrics and task status one last
        time, and save. The job cannot be saved after that.
        '''
        # If a job is canceled, then metrics file may not be in a valid state
        # do not try to read the metrics.
        if self._status != 'Canceled':
            self._metrics = self._get_metrics()
        else:
            self._metrics = []
            status_path = self.get_path_join_method()(self._exec_dir, 'status')
            if _file_util.is_local_path(status_path):
                try:
                    with open(status_path, 'w') as f:
                        _json.dump({'status': 'Canceled', 'start_time':None, 'end_time':None}, f)
                except Exception as e:
                    _logging.info('Exception trying to write job status file: %s' % e)

        # Fail the job if the last stage contains one single task. This could be
        # a single task job, or a map job with combiner.
        if (not isinstance(self._final_stage, list) and
                self._status == 'Completed' and
                len(self._metrics) > 0 and
                self._metrics[-1]['status'] == 'Failed'):
            self._status = 'Failed'
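For reference, a small sketch of how the status file written above could be read back. The reader itself is hypothetical; only the JSON layout comes from the code above.

import json

def _read_job_status(status_path):
    # Hypothetical reader for the 'status' file written in _finalize:
    # {'status': ..., 'start_time': ..., 'end_time': ...}
    with open(status_path) as f:
        info = json.load(f)
    return info.get('status'), info.get('start_time'), info.get('end_time')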
    def __get_log_file_content(self, url, handler):
        """
        Get and return the log file content
        """
        log_file_path = handler.get_argument("log", None)
        job = self.__load_job()
        content = ""
        max_size = 1048576  # max size is 1 MB
        status_code = 200
        if log_file_path:
            try:
                if _file_util.is_local_path(log_file_path):
                    if _os.path.getsize(log_file_path) > max_size:
                        raise RuntimeError(
                            "Cannot read file greater than max size.")
                    else:
                        content = self.__load_local_log_file(log_file_path)
                elif _file_util.is_s3_path(log_file_path):
                    content = _file_util.read_file_to_string_s3(
                        log_file_path, max_size,
                        job.environment.get_credentials())
                elif _file_util.is_hdfs_path(log_file_path):
                    content = _file_util.read_file_to_string_hdfs(
                        log_file_path, max_size,
                        job.environment.hadoop_conf_dir)
                else:
                    status_code = 404
                    content = "Log file path (%s) is not valid." % log_file_path
            except RuntimeError:
                status_code = 413
                content = "File size too large. Please load log file manually at %s." % log_file_path

        handler.set_status(status_code)
        handler.set_header("Content-Type", "text/plain")
        handler.write(content)
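`__load_local_log_file` is not shown in this listing. A minimal sketch of what it might do, given that the caller has already enforced the size limit:

    def __load_local_log_file(self, log_file_path):
        # Hypothetical helper: the caller has verified the file is under
        # max_size, so simply read and return its contents.
        with open(log_file_path) as f:
            return f.read()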
    def save(self, path, aws_credentials=None):
        '''
        Persist the policy to the given path
        '''
        if _file_util.is_local_path(path):
            self._save_local(path)
        else:
            self._save_remote(path, aws_credentials)
    def load(cls, path, schema_version, aws_credentials={}):
        '''
        Load the policy from the given path
        '''
        if _file_util.is_local_path(path):
            loaded_policy = cls._load_local(path)
        else:
            loaded_policy = cls._load_remote(path, schema_version, aws_credentials)

        return loaded_policy
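The `_save_local`/`_load_local` helpers dispatched to above are not part of this listing. A minimal pickle-based sketch of the pair, purely as an assumption about their shape (assumes the module imports pickle as `_pickle`):

    def _save_local(self, path):
        # Hypothetical local persistence: serialize the policy state to disk.
        with open(path, 'wb') as f:
            _pickle.dump(self.__dict__, f)

    @classmethod
    def _load_local(cls, path):
        # Hypothetical local load: rebuild a policy from the serialized state.
        with open(path, 'rb') as f:
            state = _pickle.load(f)
        policy = cls.__new__(cls)
        policy.__dict__.update(state)
        return policy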
    def load(cls, path, schema_version, aws_credentials={}):
        """ Load predictive object from given path
        """
        new_po = None
        if (fu.is_local_path(path)):
            new_po = cls._load_local(path)
        else:
            new_po = cls._load_remote(path, schema_version, aws_credentials)

        logging.info('Loaded predictive object "%s" successfully' % type(new_po).__name__)

        return new_po
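`_load_remote` is also not shown. The usual pattern elsewhere in this listing is to download to a temporary location and delegate to the local loader, so here is a hedged sketch under that assumption; the temporary-directory handling and download arguments are assumptions, not the library's actual implementation.

    @classmethod
    def _load_remote(cls, path, schema_version, aws_credentials={}):
        # Hypothetical sketch: download the remote object into a temporary
        # directory, then reuse the local loading path. Assumes the module
        # also imports tempfile, os, and shutil.
        tmp_dir = tempfile.mkdtemp(prefix='load_po_')
        try:
            local_path = os.path.join(tmp_dir, 'predictive_object')
            if fu.is_s3_path(path):
                fu.download_from_s3(path, local_path, is_dir=True,
                                    aws_credentials=aws_credentials)
            else:
                fu.download_from_hdfs(path, local_path)
            return cls._load_local(local_path)
        finally:
            shutil.rmtree(tmp_dir)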
    def _load_file_and_parse(self, file_name, parser_func, silent=False, test_url=True):
        '''
        Read a file into a local temporary file if it is remote, parse it with
        parser_func, and return the parsed result.

        This function is used for parsing state and progress files from
        local, S3, or HDFS paths.

        Returns None if any exception occurs.
        '''
        file_is_local = _file_util.is_local_path(file_name)
        local_file_name = file_name if file_is_local else _tempfile.mktemp(prefix='job-status-')

        try:
            try:
                if test_url and not self._test_url(file_name):
                    if not silent:
                        __LOGGER__.info("File %s is not available yet." % file_name)
                    return None

                if _file_util.is_hdfs_path(file_name):

                    _file_util.download_from_hdfs(
                        hdfs_path = file_name,
                        local_path = local_file_name,
                        hadoop_conf_dir=self.environment.hadoop_conf_dir)

                elif _file_util.is_s3_path(file_name):

                    _file_util.download_from_s3(
                        s3_path = file_name,
                        local_path = local_file_name,
                        is_dir = False,
                        aws_credentials = self.environment.ec2_config.get_credentials(),
                        silent = silent)

            except Exception as e:
                # It is ok the status file is not ready yet as the job is getting prepared
                if not silent:
                    __LOGGER__.warning("Exception encountered when trying to download file from %s, error: %s" % (file_name, e))
                return None

            try:
                # parse the local file
                return parser_func(local_file_name)
            except Exception as e:
                __LOGGER__.info("Exception when parsing file %s. Error: %s" % (file_name, e))
                return None
        finally:
            if (not file_is_local) and _os.path.exists(local_file_name):
                _os.remove(local_file_name)
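A short usage sketch of the helper above, parsing a JSON status file. The path and the surrounding `job` object are hypothetical; `_json` is the module's json import, as used elsewhere in this listing.

def _parse_json(local_path):
    # parser_func for _load_file_and_parse: load the downloaded file as JSON.
    with open(local_path) as f:
        return _json.load(f)

status = job._load_file_and_parse(
    'hdfs://namenode:8020/jobs/my-job/status',
    _parse_json,
    silent=True)
if status is not None:
    print(status.get('status'))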
    def _get_map_job_results(self, _silent=True):
        '''
        Get the results of all map jobs.

        Returns
        -------
        job outputs : list
          A list of results from the job. If a particular task failed, its
          result is None.
        '''

        result_folder = self.get_path_join_method()(self._exec_dir, 'output')
        __LOGGER__.info("Retrieving job results from %s..." % result_folder)
        if _file_util.is_local_path(result_folder):
            local_folder = result_folder
        else:
            local_folder = self._download_remote_folder_to_local(result_folder, silent=True)

        output = []
        for t in self._stages[0]:
            try:
                task_output_file = self._task_output_paths[t]
                local_file = self.get_path_join_method()(
                    local_folder,
                    _os.path.split(task_output_file)[1])

                unpickler = gl_pickle.GLUnpickler(local_file)
                output.append(unpickler.load())
            except Exception as e:
                if not _silent:
                    __LOGGER__.warning("Ignored exception when retrieving result for task %s, error: %s" % (t.name, e))
                output.append(None)

        # Note: the temporary result folder cannot be removed here because the
        # result SFrame may depend on the files remaining on disk.

        return output
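A brief usage note: because failed tasks yield None, callers can separate successes from failures directly. The `job` variable below is a hypothetical handle to a finished map job.

# Hypothetical usage of _get_map_job_results().
results = job._get_map_job_results(_silent=False)
failed_tasks = [i for i, r in enumerate(results) if r is None]
if failed_tasks:
    print('No result for tasks: %s' % failed_tasks)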
def _load_imp(state_path, aws_access_key_id, aws_secret_access_key):
    '''
    Internal implementation of load, used by both the external-facing load and
    the internal-facing load (gl.deploy.predictive_services[name])
    '''
    aws_credentials = None
    if _file_util.is_s3_path(state_path):
        # Save the credentials.
        if bool(aws_access_key_id) != bool(aws_secret_access_key):
            raise IOError('Either both aws_access_key_id and aws_secret_access_key ' \
                          'should be specified or neither should be specified.')
        if not aws_access_key_id and not aws_secret_access_key:
            try:
                aws_access_key_id, aws_secret_access_key = _get_credentials()
            except Exception:
                raise IOError('No AWS credentials set. Credentials must either be '
                              'passed in, or set globally using '
                              'graphlab.aws.set_credentials(...).')
        aws_credentials = {
            'aws_access_key_id': aws_access_key_id,
            'aws_secret_access_key': aws_secret_access_key
        }

    elif (not _file_util.is_hdfs_path(state_path)) and (not _file_util.is_local_path(state_path)):
        raise ValueError("Invalid state path. Predictive Service only supports loading \
                        state path from S3, HDFS or Local file path.")

    config = _PredictiveServiceEnvironment._get_state_from_file(state_path, aws_credentials)
    name = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, 'Name')
    description = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, 'Description')
    api_key = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, 'API Key')
    admin_key = config.get(_PredictiveService._ENVIRONMENT_SECTION_NAME, 'admin_key')
    # For backwards compatibility. Port used to be hard-coded as 9005 and does not
    # exist in the config.
    if (config.has_option(_PredictiveService._ENVIRONMENT_SECTION_NAME, 'port')):
        port = int(config.get(_PredictiveService._ENVIRONMENT_SECTION_NAME, 'port'))
    else:
        port = _PORT_DEFAULT_NUM

    global_cache_state = 'enabled'
    if _CACHE_STATE_SECTION_NAME_ in config.options(_PredictiveService._SERVICE_INFO_SECTION_NAME):
        global_cache_state = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, _CACHE_STATE_SECTION_NAME_)

    cors_origin = ''
    if _CORS_ORIGIN_SECTION_NAME_ in config.options(_PredictiveService._SERVICE_INFO_SECTION_NAME):
        cors_origin = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, _CORS_ORIGIN_SECTION_NAME_)

    system_config = _SystemConfig.from_config_parser(
        config, _PredictiveService._SYSTEM_SECTION_NAME)

    result = _PredictiveService(name, state_path, description, api_key, admin_key,
                                aws_credentials,
                                _new_service=False, cors_origin=cors_origin,
                                global_cache_state=global_cache_state,
                                system_config=system_config,
                                port = port)

    # create environment
    environment_info = dict(config.items(_PredictiveService._ENVIRONMENT_SECTION_NAME))
    if aws_credentials:
        environment_info['aws_credentials'] = aws_credentials
    result._environment = _predictive_service_environment_factory(environment_info)

    # get latest state
    result._get_latest_state()

    return result
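A usage sketch of `_load_imp` with an S3 state path; the bucket, file name, and key values are placeholders.

# Hypothetical usage: load a predictive service from its saved state on S3.
# Passing None for both keys falls back to graphlab.aws.set_credentials(...).
ps = _load_imp('s3://my-bucket/my-predictive-service/state.ini',
               aws_access_key_id='AKIA...',
               aws_secret_access_key='...')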