def _copy_predictive_object_files(source_path, target_path, is_dir, src_credentials, tgt_credentials):
    '''
    Copy a predictive object (a single file or a folder) from source_path
    to target_path.

    Parameters
    ----------
    source_path : str
        Local, S3 or HDFS path holding the predictive object.
    target_path : str
        Local, S3 or HDFS path to copy the object to.
    is_dir : bool
        Whether the object is stored as a directory rather than a single file.
    src_credentials : dict
        AWS credentials for reading an S3 source path.
    tgt_credentials : dict
        AWS credentials for writing an S3 target path.
    '''
    # Cleanup existing target first so the copy does not merge with stale
    # content. rmtree only works on directories, so pick the right removal
    # function (the old code always called rmtree and failed on plain files).
    if _file_util.is_local_path(target_path) and _os.path.exists(target_path):
        if _os.path.isdir(target_path):
            _shutil.rmtree(target_path)
        else:
            _os.remove(target_path)

    if _file_util.is_s3_path(source_path) and _file_util.is_s3_path(target_path):

        # compare credentials
        _check_aws_credentials(src_credentials, tgt_credentials, source_path)

        # intra s3 copy model (no local staging needed)
        _file_util.intra_s3_copy_model(source_path, target_path, is_dir, tgt_credentials)
    elif _file_util.is_local_path(source_path):

        # local source: copy_from_local handles local/S3/HDFS targets
        _file_util.copy_from_local(source_path, target_path, is_dir = is_dir)

    else:
        # Remote -> remote (other than S3 -> S3): stage through a local
        # temporary directory.
        tmp_dir = _tempfile.mkdtemp(prefix = 'copy_predictive_object')
        try:
            # download to local first
            local_path = _os.path.join(tmp_dir, 'temp_po_file')
            if _file_util.is_s3_path(source_path):
                _file_util.download_from_s3(
                    source_path,
                    local_path,
                    is_dir=is_dir,
                    aws_credentials=src_credentials,
                    silent=False)
            elif _file_util.is_hdfs_path(source_path):
                # Honor is_dir here too; it used to be hard-coded to False,
                # which broke copying directory-based objects from HDFS.
                _file_util.download_from_hdfs(source_path, local_path, is_dir = is_dir)
            else:
                raise RuntimeError('Unsupported file system type: %s' % source_path)

            # upload from local to remote
            if _file_util.is_s3_path(target_path):
                _file_util.upload_to_s3(local_path, target_path, is_dir=is_dir,
                    aws_credentials=tgt_credentials, silent=False)
            elif _file_util.is_hdfs_path(target_path):
                _file_util.hdfs_mkdir(target_path)
                _file_util.upload_to_hdfs(local_path, target_path, force=True, silent=False)
            else:
                _file_util.upload_to_local(local_path, target_path, is_dir=is_dir, silent=False)

        finally:
            # Always remove the staging directory, even on failure.
            _shutil.rmtree(tmp_dir)
Ejemplo n.º 2
0
 def _test_url(self, file_path):
     """Return True when *file_path* exists on HDFS, S3 or the local disk."""
     if _file_util.is_hdfs_path(file_path):
         # 'e' requests an existence test from hdfs_test_url.
         return _file_util.hdfs_test_url(file_path, 'e', self.environment.hadoop_conf_dir)
     elif _file_util.is_s3_path(file_path):
         return _file_util.s3_test_url(file_path, self.environment.ec2_config.get_credentials())
     else:
         return _os.path.exists(file_path)
    def _read_commander_init_status_file(self):
        """
        Read the commander init status file from the job's remote location
        (HDFS or S3) and return the commander's address.

        Returns
        -------
        str or None
            'http://<host>:<port>' once the commander has reported a
            positive port; otherwise None (including when the status file
            is not available or cannot be parsed yet).
        """
        commander_file_path = self._get_commander_file_path()

        # NOTE(review): mktemp only reserves a name; the file itself is
        # created by the download helpers below.
        local_file_name = _tempfile.mktemp(prefix='dml_file_')
        try:
            if _file_util.is_hdfs_path(commander_file_path):
                _file_util.download_from_hdfs(
                    commander_file_path,
                    local_file_name,
                    hadoop_conf_dir = self.environment.hadoop_conf_dir)
            elif _file_util.is_s3_path(commander_file_path):
                _file_util.download_from_s3(
                    commander_file_path,
                    local_file_name,
                    aws_credentials = self.environment.get_credentials(),
                    silent = True)

            with open(local_file_name,'r') as f:
                status_json = _json.load(f)
                port = status_json['port']
                host_name = status_json['host_name']

            if port > 0:
                return 'http://%s:%s' % (host_name, port)
            else:
                return None
        except Exception:
            # The status file may simply not exist yet while the job is
            # being prepared; the caller retries, so ignore the error.
            # (Was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit.)
            return None
        finally:
            if _os.path.exists(local_file_name):
                _os.remove(local_file_name)
Ejemplo n.º 4
0
    def __get_log_file_content(self, url, handler):
        """
        Get and return the log file content

        Loads the file named by the request's "log" argument (local, S3 or
        HDFS path) and writes it to *handler* as text/plain. Responds 404
        for an unrecognized path scheme and 413 when the file exceeds the
        1MB cap.
        """
        log_file_arg = handler.get_argument("log", None)
        job = self.__load_job()
        content = ""
        # Plain int literal: long() does not exist on Python 3 and the
        # value fits an int on Python 2 anyway.
        max_size = 1048576  # max size is 1mb
        status_code = 200
        # Check the raw argument for None: the old str(...)-then-truthiness
        # check turned a missing argument into the truthy string "None".
        if log_file_arg is not None:
            log_file_path = str(log_file_arg)
            try:
                if _file_util.is_local_path(log_file_path):
                    if _os.path.getsize(log_file_path) > max_size:
                        raise RuntimeError(
                            "Cannot read file greater than max size.")
                    else:
                        content = self.__load_local_log_file(log_file_path)
                elif _file_util.is_s3_path(log_file_path):
                    content = _file_util.read_file_to_string_s3(
                        log_file_path, max_size,
                        job.environment.get_credentials())
                elif _file_util.is_hdfs_path(log_file_path):
                    content = _file_util.read_file_to_string_hdfs(
                        log_file_path, max_size,
                        job.environment.hadoop_conf_dir)
                else:
                    status_code = 404
                    content = "Log file path (%s) is not valid." % log_file_path
            except RuntimeError:
                status_code = 413
                content = "File size too large. Please load log file manually at %s." % log_file_path

        handler.set_status(status_code)
        handler.set_header("Content-Type", "text/plain")
        handler.write(content)
Ejemplo n.º 5
0
    def __get_log_file_path_list(self, url, handler):
        """
        Write the list of log file paths for this job to *handler* as
        {'log_file_list': [...]}.
        """
        job = self.__load_job()

        # Directory that contains all the logs for this job.
        log_dir = job.get_log_file_path()

        # List the directory remotely to get the full path of each log;
        # a plain local path is returned as-is.
        if _file_util.is_s3_path(log_dir):
            listing = _file_util.list_s3(log_dir,
                                         job.environment.get_credentials())
            path_list = [entry['path'] for entry in listing] if listing else []
        elif _file_util.is_hdfs_path(log_dir):
            listing = _file_util.list_hdfs(log_dir,
                                           job.environment.hadoop_conf_dir)
            path_list = [entry['path'] for entry in listing] if listing else []
        else:
            path_list = [log_dir]

        handler.write({'log_file_list': path_list})
Ejemplo n.º 6
0
    def __get_log_file_content(self, url, handler):
        """
        Get and return the log file content

        Loads the file named by the request's "log" argument (local, S3 or
        HDFS path) and writes it to *handler* as text/plain. Responds 404
        for an unrecognized path scheme and 413 when the file exceeds the
        1MB cap.
        """
        log_file_arg = handler.get_argument("log", None)
        job = self.__load_job()
        content = ""
        # Plain int literal: the old "1048576L" long suffix is a syntax
        # error on Python 3 and unnecessary on Python 2.
        max_size = 1048576  # max size is 1mb
        status_code = 200
        # Check the raw argument for None: the old str(...)-then-truthiness
        # check turned a missing argument into the truthy string "None".
        if log_file_arg is not None:
            log_file_path = str(log_file_arg)
            try:
                if _file_util.is_local_path(log_file_path):
                    if _os.path.getsize(log_file_path) > max_size:
                        raise RuntimeError("Cannot read file greater than max size.")
                    else:
                        content = self.__load_local_log_file(log_file_path)
                elif _file_util.is_s3_path(log_file_path):
                    content = _file_util.read_file_to_string_s3(log_file_path,
                                                        max_size,
                                                        job.environment.get_credentials())
                elif _file_util.is_hdfs_path(log_file_path):
                    content = _file_util.read_file_to_string_hdfs(log_file_path,
                                                        max_size,
                                                        job.environment.hadoop_conf_dir)
                else:
                    status_code = 404
                    content = "Log file path (%s) is not valid." % log_file_path
            except RuntimeError:
                status_code = 413
                content = "File size too large. Please load log file manually at %s." % log_file_path

        handler.set_status(status_code)
        handler.set_header("Content-Type", "text/plain")
        handler.write(content)
Ejemplo n.º 7
0
 def _test_url(self,file_path):
     """Return whether *file_path* exists, dispatching on its scheme
     (HDFS, S3, or local filesystem)."""
     if _file_util.is_hdfs_path(file_path):
         # 'e' requests an existence test from hdfs_test_url.
         return _file_util.hdfs_test_url(file_path,'e',self.environment.hadoop_conf_dir)
     if _file_util.is_s3_path(file_path):
         return _file_util.s3_test_url(file_path,self.environment.ec2_config.get_credentials())
     else:
         # Neither HDFS nor S3: treat as a local path.
         return _os.path.exists(file_path)
    def save(self, path, aws_credentials = None):
        """ Save predictive object to the given path

        Parameters
        ----------
        path : str
          The location to save the predictive object to. Local, S3 and
          HDFS paths are supported.
        aws_credentials : dict, optional
          AWS credentials used when saving to an S3 path.

        Raises
        ------
        RuntimeError
          If *path* is not a local, S3 or HDFS path.
        """
        # None default instead of the previous mutable {} default argument.
        if aws_credentials is None:
            aws_credentials = {}

        # only support saving to local, S3 or HDFS for now
        if (not (fu.is_s3_path(path) or \
                 fu.is_local_path(path) or \
                 fu.is_hdfs_path(path))):
            # Message now mentions HDFS, which the check above accepts.
            raise RuntimeError("Only save to local, S3 and HDFS path is supported, cannot \
              save predictive object to path %s. " % path)

        # Overwrite semantics: remove whatever currently sits at the local
        # target path. (The old code tested os.path.exists twice.)
        if fu.is_local_path(path) and os.path.exists(path):
            logging.warning("Overwriting existing file '%s' when saving predictive object" % path)
            rm_fn = os.remove if os.path.isfile(path) else shutil.rmtree
            rm_fn(path)

        if fu.is_local_path(path):
            self._save_local(path)
        else:
            self._save_remote(path, aws_credentials)

        tracker = _mt._get_metric_tracker()
        tracker.track('deploy.predictive_service.predictive_object',
            value=1,
            properties={
                'type': self.__class__.__name__
            }
        )
Ejemplo n.º 9
0
    def __get_log_file_path_list(self, url, handler):
        """
        Returns a list of log file path for this job

        Writes {'log_file_list': [...]} to *handler*. S3 and HDFS log
        directories are listed remotely; any other path is returned as
        the single entry.
        """
        job = self.__load_job()

        # get the directory that contains all the logs
        log_file_path = job.get_log_file_path()
        path_list = []

        # list the directory to get the full path to each log
        if _file_util.is_s3_path(log_file_path):
            ec2_log_list = _file_util.list_s3(
                log_file_path, job.environment.get_credentials())
            if ec2_log_list and len(ec2_log_list) > 0:
                path_list.extend([log['path'] for log in ec2_log_list])
        elif _file_util.is_hdfs_path(log_file_path):
            hdfs_log_list = _file_util.list_hdfs(
                log_file_path, job.environment.hadoop_conf_dir)
            if hdfs_log_list and len(hdfs_log_list) > 0:
                path_list.extend([log['path'] for log in hdfs_log_list])
        else:
            # Local path: no remote listing; hand back the path itself.
            path_list.append(log_file_path)

        handler.write({'log_file_list': path_list})
    def _load_remote(cls, path, schema_version, aws_credentials=None):
        """
        Download a predictive object from a remote *path* (S3 or HDFS) into
        a temporary directory and load it from there.

        Objects with schema_version > 2 are stored as directories; older
        ones as single files.

        Raises
        ------
        ValueError
            If *path* is neither an S3 nor an HDFS path.
        """
        # None default instead of the previous mutable {} default argument.
        if aws_credentials is None:
            aws_credentials = {}

        # NOTE(review): '_tempfie' looks like a typo'd alias of the tempfile
        # module — confirm the module-level import spelling.
        temp_dir = _tempfie.mkdtemp(prefix='predictive_object_')

        if fu.is_s3_path(path):
            fu.download_from_s3(path, temp_dir, is_dir=(schema_version > 2),
                              aws_credentials=aws_credentials)
        elif fu.is_hdfs_path(path):
            fu.download_from_hdfs(path, temp_dir, is_dir=(schema_version > 2))
        else:
            # The old "assert '...'" was a no-op (a non-empty string is
            # always truthy); raise so unsupported paths fail loudly.
            raise ValueError('Only support S3 and HDFS path for Predictive '
                             'Object saving location!')

        return cls._load_local(temp_dir)
Ejemplo n.º 11
0
    def _load_remote(cls, path, schema_version, aws_credentials=None):
        """
        Download a predictive policy from *path* (S3 or HDFS) into a
        temporary directory and load it locally.

        Raises
        ------
        ValueError
            If *path* is neither an S3 nor an HDFS path.
        """
        # None default instead of the previous mutable {} default argument.
        if aws_credentials is None:
            aws_credentials = {}

        temp_dir = _gl.util._make_temp_filename(prefix='predictive_policy_')

        if _file_util.is_s3_path(path):
            _file_util.download_from_s3(path, temp_dir, is_dir=True,
                              aws_credentials=aws_credentials, silent=True)
        elif _file_util.is_hdfs_path(path):
            _file_util.download_from_hdfs(path, temp_dir, is_dir=True)
        else:
            # The old "assert '...'" was a no-op (a non-empty string is
            # always truthy); raise so unsupported paths fail loudly.
            raise ValueError('Only support S3 and HDFS path for Predictive '
                             'Object saving location!')

        return cls._load_local(temp_dir)
 def _save_remote(self, path, aws_credentials):
     '''Save current predictive object to S3

     Stages the object in a local temporary directory via _save_local,
     then uploads it to *path* (S3 or HDFS). The staging directory is
     always removed afterwards.
     '''
     # NOTE(review): '_tempfie' appears to be a typo'd alias of the tempfile
     # module — verify the module-level import spelling.
     tempdir = _tempfie.mkdtemp(prefix='predictive_object_')
     try:
         self._save_local(tempdir)
         if fu.is_s3_path(path):
             fu.upload_to_s3(tempdir, path, is_dir=True, aws_credentials = aws_credentials)
         elif fu.is_hdfs_path(path):
             fu.hdfs_mkdir(path)
             # Upload the directory contents ('/*'), not the directory itself.
             fu.upload_to_hdfs(tempdir + '/*', path)
     finally:
         # Always remove the local staging directory.
         shutil.rmtree(tempdir)
Ejemplo n.º 13
0
 def _upload_folder_to_remote(self, local, remote):
     '''Upload the local folder *local* to *remote* (S3 or HDFS).

     NOTE(review): silently does nothing when *remote* is neither an S3
     nor an HDFS path — confirm callers never pass a local target.
     '''
     if _file_util.is_s3_path(remote):
         _file_util.upload_to_s3(
             local,
             remote,
             is_dir = True,
             aws_credentials = self.environment.get_credentials(),
             silent = True)
     elif _file_util.is_hdfs_path(remote):
         _file_util.upload_folder_to_hdfs(
             local,
             remote,
             self.environment.hadoop_conf_dir)
Ejemplo n.º 14
0
    def _save_remote(self, path, aws_credentials):
        """Stage the policy locally, then upload it to *path* (S3 or HDFS)."""
        staging_dir = _gl.util._make_temp_filename(prefix='predictive_policy_')
        try:
            # Serialize into the staging directory first.
            self._save_local(staging_dir)
            if _file_util.is_s3_path(path):
                _file_util.upload_to_s3(
                    staging_dir, path, is_dir=True,
                    aws_credentials=aws_credentials)
            elif _file_util.is_hdfs_path(path):
                _file_util.hdfs_mkdir(path)
                # Push the directory contents, not the directory itself.
                _file_util.upload_to_hdfs(staging_dir + '/*', path)
        finally:
            # Remove the staging directory whether or not the upload worked.
            _shutil.rmtree(staging_dir)
Ejemplo n.º 15
0
    def _load_file_and_parse(self, file_name, parser_func, silent=False, test_url=True):
        '''
        Read remote file to a local temporary file, and use parser_func
        to parse the content, returns the parsed result.

        This function is used for parsing state and progress files from
        either local, S3 or HDFS.

        If there is any exception happened, returns None

        Parameters
        ----------
        file_name : str
            Local, S3 or HDFS path of the file to parse.
        parser_func : callable
            Called with the local file name; its return value is returned.
        silent : bool
            Suppress info/warning logging when True.
        test_url : bool
            When True, probe the path with _test_url first and return None
            early if the file is not available yet.
        '''
        # Local files are parsed in place; remote ones go through a temp file.
        file_is_local = _file_util.is_local_path(file_name)
        local_file_name = file_name if file_is_local else _tempfile.mktemp(prefix='job-status-')

        try:
            # Inner try: download errors are expected while the job warms up,
            # so they are logged and turned into a None return.
            try:
                if test_url and not self._test_url(file_name):
                    if not silent:
                        __LOGGER__.info("File %s is not available yet." % file_name)
                    return None

                if _file_util.is_hdfs_path(file_name):

                    _file_util.download_from_hdfs(
                        hdfs_path = file_name,
                        local_path = local_file_name,
                        hadoop_conf_dir=self.environment.hadoop_conf_dir)

                elif _file_util.is_s3_path(file_name):

                    _file_util.download_from_s3(
                        s3_path = file_name,
                        local_path = local_file_name,
                        is_dir = False,
                        aws_credentials = self.environment.ec2_config.get_credentials(),
                        silent = silent)

            except Exception as e:
                # It is ok the status file is not ready yet as the job is getting prepared
                if not silent:
                    __LOGGER__.warning("Exception encountered when trying to download file from %s, error: %s" % (file_name, e))
                return None

            try:
                # parse the local file
                return parser_func(local_file_name)
            except Exception as e:
                __LOGGER__.info("Exception when parsing file %s. Error: %s" % (file_name, e))
                return None
        finally:
            # Only delete the temp copy; never remove the caller's local file.
            if (not file_is_local) and _os.path.exists(local_file_name):
                _os.remove(local_file_name)
Ejemplo n.º 16
0
    def _load_file_and_parse(self, file_name, parser_func, silent=False, test_url=True):
        '''
        Read remote file to a local temporary file, and use parser_func
        to parse the content, returns the parsed result.

        This function is used for parsing state and progress files from
        either local, S3 or HDFS.

        If there is any exception happened, returns None

        Parameters
        ----------
        file_name : str
            Local, S3 or HDFS path of the file to parse.
        parser_func : callable
            Called with the local file name; its return value is returned.
        silent : bool
            Suppress info/warning logging when True.
        test_url : bool
            When True, probe the path with _test_url first and return None
            early if the file is not available yet.
        '''
        # Local files are parsed in place; remote ones are downloaded to a
        # temporary file first.
        file_is_local = _file_util.is_local_path(file_name)
        local_file_name = file_name if file_is_local else _tempfile.mktemp(prefix='job-status-')

        try:
            # Download failures are expected while the job is starting up,
            # so the inner try logs them and returns None.
            try:
                if test_url and not self._test_url(file_name):
                    if not silent:
                        __LOGGER__.info("File %s is not available yet." % file_name)
                    return None

                if _file_util.is_hdfs_path(file_name):

                    _file_util.download_from_hdfs(
                        hdfs_path = file_name,
                        local_path = local_file_name,
                        hadoop_conf_dir=self.environment.hadoop_conf_dir)

                elif _file_util.is_s3_path(file_name):

                    _file_util.download_from_s3(
                        s3_path = file_name,
                        local_path = local_file_name,
                        is_dir = False,
                        aws_credentials = self.environment.ec2_config.get_credentials(),
                        silent = silent)

            except Exception as e:
                # It is ok the status file is not ready yet as the job is getting prepared
                if not silent:
                    __LOGGER__.warning("Exception encountered when trying to download file from %s, error: %s" % (file_name, e))
                return None

            try:
                # parse the local file
                return parser_func(local_file_name)
            except Exception as e:
                __LOGGER__.info("Exception when parsing file %s. Error: %s" % (file_name, e))
                return None
        finally:
            # Delete only the temporary copy; never the caller's local file.
            if (not file_is_local) and _os.path.exists(local_file_name):
                _os.remove(local_file_name)
Ejemplo n.º 17
0
    def _download_remote_folder_to_local(self, remote_path, silent=False):
        '''
        Download all files from remote path to local. Caller is responsible for
        cleaning up the local folder after finishing usage

        Returns the local temporary folder

        Raises
        ------
        RuntimeError
            If remote_path is neither an S3 nor an HDFS path.
        '''
        local_path = _tempfile.mkdtemp(prefix='job-results')

        try:
            if _file_util.is_hdfs_path(remote_path):

                _file_util.download_from_hdfs(
                    hdfs_path = remote_path,
                    local_path = local_path,
                    is_dir = True,
                    hadoop_conf_dir=self.environment.hadoop_conf_dir)

            elif _file_util.is_s3_path(remote_path):

                _file_util.download_from_s3(
                    s3_path = remote_path,
                    local_path = local_path,
                    is_dir = True,
                    aws_credentials = self.environment.ec2_config.get_credentials(),
                    silent = silent)
            else:
                raise RuntimeError("'%s' is not a supported remote path. Only S3 and HDFS"
                                    " remote path are supported" % remote_path)
        except:
            # Make sure we cleanup local files if we cannot successfully
            # download files; the original exception is re-raised below.
            if _os.path.isdir(local_path):
                _shutil.rmtree(local_path)

            raise

        return local_path
Ejemplo n.º 18
0
    def _download_remote_folder_to_local(self, remote_path, silent=False):
        '''
        Fetch every file under *remote_path* (S3 or HDFS) into a fresh local
        temporary directory and return that directory.

        The caller owns the returned folder and must delete it when done.
        '''
        staging_dir = _tempfile.mkdtemp(prefix='job-results')
        try:
            if _file_util.is_hdfs_path(remote_path):
                _file_util.download_from_hdfs(
                    hdfs_path=remote_path,
                    local_path=staging_dir,
                    is_dir=True,
                    hadoop_conf_dir=self.environment.hadoop_conf_dir)
            elif _file_util.is_s3_path(remote_path):
                _file_util.download_from_s3(
                    s3_path=remote_path,
                    local_path=staging_dir,
                    is_dir=True,
                    aws_credentials=self.environment.ec2_config.get_credentials(),
                    silent=silent)
            else:
                raise RuntimeError("'%s' is not a supported remote path. Only S3 and HDFS"
                                    " remote path are supported" % remote_path)
        except:
            # On any failure, drop the partially downloaded folder before
            # propagating the error.
            if _os.path.isdir(staging_dir):
                _shutil.rmtree(staging_dir)
            raise

        return staging_dir
Ejemplo n.º 19
0
def dml_exec(function_name, data, env='auto', verbose=True, **kwargs):
    """
    Executes a distributed ml function

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exists in the unity distributed shared library.

    data : dict
        Key value arguments to the function stored in a dictionary

    env : DMLEnvironment
        Contains job environment parameters and a job submit function.
        'auto' (the default) constructs a DMLRemoteEnvironment.

    verbose : bool
        When True, stream commander and worker logs to stdout and local
        temporary log files while the job runs.

    **kwargs : dict
        Additional options.
        See _get_worker_args and _get_commander_args.
            - check_hdfs : {0, 1} Perform sanity check for hdfs read and write
            - startup_timeout : int Timeout in seconds for cluster setup

    Return
    ------
    (success, message, result_path) : bool, str, str
    """
    from graphlab.extensions import dml_function_invocation, init_dml_class_registry
    init_dml_class_registry()

    if env == 'auto':
        env = DMLRemoteEnvironment()

    # Make sure the shared working directory exists before serializing into it.
    if not file_util.exists(env.working_dir):
        _log.debug('Creating working directory: %s' % env.working_dir)
        file_util.mkdir(env.working_dir)
    else:
        _log.debug('Using existing working directory: %s' % env.working_dir)

    _log.info('Running distributed execution with %d workers. Working directory: %s' % (env.num_workers, env.working_dir))

    success = False
    message = ""
    result_path = None

    # Phase 1: serialize the job's arguments into the working directory.
    # Job function arguments
    try:
        _log.info('Serializing arguments to %s' % env.working_dir)
        args = dml_function_invocation()
        # Copy so the caller's dict is not mutated by the __base_path__ key.
        data_copy = copy(data)
        internal_working_dir = _make_internal_url(env.working_dir)
        data_copy['__base_path__'] = internal_working_dir
        args.from_dict(data_copy, internal_working_dir)
        json_data = args.to_str()

        # sanitize the base path url

        sanitized_json_data = json_data
        if file_util.is_s3_path(json_data):
            sanitized_json_data = _sanitize_internal_s3_url(json_data)

        _log.info('Serialized arguments: %s' % sanitized_json_data)
    except Exception as e:
        success = False
        message = 'Error serializing arguments. %s' % str(e)
        return (success, message, None)

    # Phase 2: submit the job to the cluster.
    # Submit job
    try:
        job = dml_submit(function_name, json_data, env,
                         metric_server_address_file=COMMANDER_LOG_SERVER_ADDRESS_FILE,
                         logprogress_file=PROGRESS_LOG_FILE,
                         **kwargs)
    except KeyboardInterrupt:
        message = 'Canceled by user'
        return (success, message, None)

    # Phase 3: optionally attach log streams (progress, commander, workers)
    # so output shows up live on stdout and in local temp files.
    _log.info('Waiting for workers to start ... ')
    logprinter = None
    if verbose:
        log_server_address_path = os.path.join(env.working_dir,
                                               COMMANDER_LOG_SERVER_ADDRESS_FILE)
        log_server_address = get_log_metric_server_address(log_server_address_path,
                                                           timeout=INIT_TIMEOUT_PER_WORKER * env.num_workers)
        if len(log_server_address) > 0:
            tmp_log_dir = tempfile.mkdtemp(prefix='graphlab_dml_log_')
            fd_list = []
            logprinter = LogPrinter()
            # Attach log progress stream
            logprinter.add_stream(LogStream(log_server_address + '/progress',
                                            os.path.join(env.working_dir, PROGRESS_LOG_FILE),
                                            sys.stdout))
            # Attach commander log stream
            local_commander_log = open(os.path.join(tmp_log_dir, COMMANDER_LOG_FILE), 'w')
            fd_list.append(local_commander_log)
            logprinter.add_stream(LogStream(log_server_address + '/commander',
                                            os.path.join(env.working_dir, COMMANDER_LOG_FILE),
                                            local_commander_log))
            # Attach worker log streams
            for i in range(env.num_workers):
                local_worker_log = open(os.path.join(tmp_log_dir, WORKER_LOG_FILE(i)), 'w')
                fd_list.append(local_worker_log)
                logprinter.add_stream(LogStream(log_server_address + '/worker%d' % i,
                                                os.path.join(env.working_dir, WORKER_LOG_FILE(i)),
                                                local_worker_log))
            logprinter.start()
            _log.info('Success. Worker logs are avaiable at %s ' % tmp_log_dir)

    _log.debug('Wait for job to finish')
    (success, message) = _wait_and_parse_job_result(job)

    # Stop streaming and close local log files (fd_list is only bound when
    # logprinter was created above).
    if logprinter:
        logprinter.stop()
        for fd in fd_list:
            fd.close()

    # Phase 4: read back and deserialize the result file on success.
    if success:
        try:
            result_path = os.path.join(env.working_dir, env.output_name)
            ret_str = file_util.read(result_path)
            sanitized_ret_str = _sanitize_internal_s3_url(ret_str)
            _log.debug('Deserializing results: %s' % sanitized_ret_str)

            args.from_str(ret_str)
            response = args.to_dict()

            # Check toolkit response for "result" key or "exception" key.
            if 'result' in response:
                return (success, message, response['result'])
            elif 'exception' in response:
                return (False, response['exception'], None)
            else:
                raise ValueError('Invalid toolkit response. Must have "result" or \
                                 "exception" as key')
        except Exception as e:
            success = False
            message = 'Error deserializing results. %s' % str(e)
            return (success, message, None)
    else:
        return (success, message, None)
Ejemplo n.º 20
0
def copy_ec2_predictive_object(source_ps, target_ps, source_po_name, target_po_name=None, update=False):
    '''
    Copy a predictive object from a source Predictive Service to a target
    Predictive Service.

    Parameters
    ----------
    source_ps : Predictive Service object
        The source Predictive Service that holds the predictive object specified
        in source_po_name.

    target_ps : Predictive Service object
        The target Predictive Service that will accept the predictive object
        copied from the source Predictive Service.

    source_po_name : str
        The name of the predictive object to be copied. Must exist on the
        source Predictive Service.

    target_po_name : str, optional
        The name of the predictive object to be stored to the target Predictive
        Service. If target_po_name is None, the target Predictive Service would use
        source_po_name as the predictive object name. Default value is None.

    update : boolean, optional
        If a predictive object already exists on the target Predictive Service
        with the name specified by target_po_name, set this to True if you want to
        update the existing predictive object on the target Predictive Service
        with the predictive object from the source Predictive Service. Otherwise,
        leave this to the default value False to prevent update.

    Notes
    -----
    This operation will by-pass `apply_changes` operation on the target Predictive
    Service to add/update the predictive object.

    Examples
    --------
    To copy a predictive object named 'recommender' from a source Predictive
    Service to a target Predictive Service:

        >>> gl.deploy.predictive_service.copy_predictive_object(source_ps, target_ps, 'recommender')

    To update the 'recommender' predictive object on the target Predictive Service
    with the 'recommender' predictive object from the source Predictive Service:

        >>> gl.deploy.predictive_service.copy_predictive_object(source_ps, target_ps, 'recommender', update=True)

    To copy the 'recommender' predictive object from the source Predictive Service
    to the target Predictive Service and rename it 'rec':

        >>> gl.deploy.predictive_service.copy_predictive_object(source_ps, target_ps, 'recommender', 'rec')

    '''
    # ---- Validation: both services must be live, EC2-backed, and have
    # compatible schema versions. ----
    if not source_ps or type(source_ps) is not _PredictiveService:
        raise ValueError("Invalid source Predictive Service.")
    source_ps._ensure_not_terminated()

    if not target_ps or type(target_ps) is not _PredictiveService:
        raise ValueError("Invalid target Predictive Service.")

    target_ps._ensure_not_terminated()

    # make sure both predictive services are deployed on AWS
    if not _file_util.is_s3_path(source_ps._state_path) or not _file_util.is_s3_path(target_ps._state_path):
        raise ValueError("Both source and target Predictive Services must be deployed on EC2")

    # if source is version 1, fail
    if source_ps._schema_version == 1:
        raise ValueError("The Predictive Service that you are trying to " \
                         "load is running version 1, which is no " \
                         "longer supported. Please re-create your " \
                         "Predictive Service using your current version " \
                         "of GraphLab Create.")

    # if source is newer than target, fail
    if source_ps._schema_version > target_ps._schema_version:
        raise ValueError("Cannot copy from a version %d Predictive Service " \
                         "to a version %d Predictive Service." % \
                         (source_ps._schema_version, target_ps._schema_version))

    if target_ps._schema_version != PREDICTIVE_SERVICE_SCHEMA_VERSION:
        raise RuntimeError('Target Predictive Service has schema version %s, '
            'copy_predictive_object is only supported if target Predictive Service '
            'is of schema version %s' % (target_ps._schema_version, PREDICTIVE_SERVICE_SCHEMA_VERSION))

    # make sure no extra local changes
    target_ps._ensure_no_local_changes()

    if source_po_name not in source_ps.deployed_predictive_objects:
        raise ValueError("No predictive object named \"%s\" in the source " \
                         "Predictive Service (%s)" % (str(source_po_name), str(source_ps.name)))

    # set the target predictive object name
    target_po_name = source_po_name if not target_po_name else target_po_name

    # get the version for the target predictive service
    if target_po_name in target_ps.deployed_predictive_objects:
        if update is False:
            # Trailing spaces added inside the fragments: the implicit string
            # concatenation previously produced "Service.Please" / "thetarget".
            raise RuntimeError("Cannot update the predictive object %s in the target Predictive Service. " \
                            "Please set update to True if you want to update this predictive object in the " \
                            "target Predictive Service." % target_po_name)

        target_version = 1 + target_ps.deployed_predictive_objects[target_po_name]['version']
    else:
        target_version = 1

    # get predictive object info
    source_po_info = source_ps._endpoints[source_po_name]

    po_info = {'version': target_version,
               'docstring': source_po_info['docstring'],
               'cache_state': source_po_info['cache_state'],
               'schema_version': source_po_info['schema_version'],
               'type': source_po_info.get('type', 'model'),
               'description': source_po_info['description']}

    # get path for predictive objects
    # NOTE(review): only 'model'-type objects have their files copied; other
    # endpoint types are registered on the target without a file copy.
    if source_po_info.get('type', 'model') == 'model':
        # check if source po is directory or file (schema >= 3 uses a directory)
        is_dir = True
        if source_po_info['schema_version'] < 3:
            is_dir = False

        source_path = source_ps._get_predictive_object_save_path(source_po_name, source_po_info['version'])
        target_path = target_ps._get_predictive_object_save_path(target_po_name, target_version)

        # compare credentials
        _check_aws_credentials(source_ps._environment.aws_credentials,
                               target_ps._environment.aws_credentials, source_path)

        # intra s3 copy model
        _file_util.intra_s3_copy_model(source_path, target_path, is_dir, target_ps._environment.aws_credentials)

    # add po_info to target_ps
    target_ps._endpoints[target_po_name] = po_info

    # save state to s3
    target_ps._save_state()
    try:
        target_ps._environment.poke()
    except _ConnectionError as e:
        # Use the exception itself; e.message is Python-2-only and raises
        # AttributeError on Python 3.
        _logger.warn("Unable to connect to target Predictive Service: %s" % e)
    target_ps._update_local_state()
    _logger.info("Successfully copied predictive object \"%s\" from Predictive Service (%s) " \
                 "to Predictive Service (%s)." % (str(source_po_name),
                 str(source_ps.name), str(target_ps.name)))
Ejemplo n.º 21
0
def _load_imp(state_path, aws_access_key_id, aws_secret_access_key):
    '''
    Internal implementation of the load, used by both external facing load and by
    internal facing load (gl.deploy.predictive_services[name])

    Parameters
    ----------
    state_path : str
        Location of the service state file. May be an S3, HDFS, or local path.
    aws_access_key_id : str | None
        AWS access key, required (together with the secret key) only for S3
        state paths. If both keys are None, globally configured credentials
        are used.
    aws_secret_access_key : str | None
        AWS secret key; see above.

    Returns
    -------
    _PredictiveService
        A service object populated from the persisted state, with its
        environment attached and its local state refreshed.

    Raises
    ------
    IOError
        If only one of the two AWS keys is given, or no credentials can be
        resolved for an S3 path.
    ValueError
        If state_path is not an S3, HDFS, or local path.
    '''
    aws_credentials = None
    if _file_util.is_s3_path(state_path):
        # Save the credentials. Both keys must be given together or not at all.
        if bool(aws_access_key_id) != bool(aws_secret_access_key):
            raise IOError('Either both aws_access_key_id and aws_secret_access_key ' \
                          'should be specified or neither should be specified.')
        if not aws_access_key_id and not aws_secret_access_key:
            try:
                aws_access_key_id, aws_secret_access_key = _get_credentials()
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # still propagate; any credential-lookup failure becomes IOError.
                raise IOError('No AWS credentials set. Credentials must either be ' \
                              'passed in, or set globally using ' \
                              'graphlab.aws.set_credentials(...).')
        aws_credentials = {
            'aws_access_key_id': aws_access_key_id,
            'aws_secret_access_key': aws_secret_access_key
        }

    elif (not _file_util.is_hdfs_path(state_path)) and (not _file_util.is_local_path(state_path)):
        raise ValueError("Invalid state path. Predictive Service only supports loading \
                        state path from S3, HDFS or Local file path.")

    # Parse the persisted state (a ConfigParser-style file) and pull out the
    # service-level fields.
    config = _PredictiveServiceEnvironment._get_state_from_file(state_path, aws_credentials)
    name = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, 'Name')
    description = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, 'Description')
    api_key = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, 'API Key')
    admin_key = config.get(_PredictiveService._ENVIRONMENT_SECTION_NAME, 'admin_key')
    # For backwards compatibility. Port used to be hard-coded as 9005 and does not
    # exist in the config.
    if (config.has_option(_PredictiveService._ENVIRONMENT_SECTION_NAME, 'port')):
        port = int(config.get(_PredictiveService._ENVIRONMENT_SECTION_NAME, 'port'))
    else:
        port = _PORT_DEFAULT_NUM

    # Optional fields with defaults, also for backwards compatibility.
    global_cache_state = 'enabled'
    if _CACHE_STATE_SECTION_NAME_ in config.options(_PredictiveService._SERVICE_INFO_SECTION_NAME):
        global_cache_state = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, _CACHE_STATE_SECTION_NAME_)

    cors_origin = ''
    if _CORS_ORIGIN_SECTION_NAME_ in config.options(_PredictiveService._SERVICE_INFO_SECTION_NAME):
        cors_origin = config.get(_PredictiveService._SERVICE_INFO_SECTION_NAME, _CORS_ORIGIN_SECTION_NAME_)

    system_config = _SystemConfig.from_config_parser(
        config, _PredictiveService._SYSTEM_SECTION_NAME)

    # _new_service=False: we are re-hydrating an existing service, not creating one.
    result = _PredictiveService(name, state_path, description, api_key, admin_key,
                                aws_credentials,
                                _new_service=False, cors_origin=cors_origin,
                                global_cache_state=global_cache_state,
                                system_config=system_config,
                                port = port)

    # create environment from the persisted environment section; inject the
    # resolved AWS credentials since they are not stored in the state file
    environment_info = dict(config.items(_PredictiveService._ENVIRONMENT_SECTION_NAME))
    if aws_credentials:
        environment_info['aws_credentials'] = aws_credentials
    result._environment = _predictive_service_environment_factory(environment_info)

    # get latest state
    result._get_latest_state()

    return result