Example #1
def remove_package(turi_dist_path, filename, hadoop_conf_dir=None):
    '''
    Remove a package from the available packages for this Hadoop Turi Distributed
    installation. This package will no longer be available for installation.

    Parameters
    -----------
    turi_dist_path : str
        The location where Turi Distributed is installed. This usually comes from
        your Hadoop Administrator. This path must be a valid HDFS path.

    filename : str
        File name of the package to remove from the Turi Distributed
        installation.

    hadoop_conf_dir : str, optional
        Directory where the Hadoop configuration files are stored. If not
        given, the configuration files are searched for automatically in your
        CLASSPATH. hadoop_conf_dir must be a local file path.

    '''
    hadoop_conf_dir = _file_util.expand_full_path(
        hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_turi_distr_param(turi_dist_path, hadoop_conf_dir)

    full = turi_dist_path + HadoopCluster._DIST_USER_PKG + "/" + filename
    _file_util.remove_hdfs(full, hadoop_conf_dir=hadoop_conf_dir)
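A minimal usage sketch; the HDFS path and file name below are placeholders that would come from your Hadoop administrator and from show_available_packages:

# Hypothetical values: substitute your own Turi Distributed path and package file.
remove_package('hdfs://namenode:8020/turi-dist', 'names-0.3.0.tar.gz',
               hadoop_conf_dir='/etc/hadoop/conf')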
def remove_package(dato_dist_path, filename, hadoop_conf_dir=None):
    '''
    Remove a package from the available packages for this Hadoop Dato Distributed
    installation. This package will no longer be available for installation.

    Parameters
    -----------
    dato_dist_path : str
        The location where Dato Distributed is installed. This usually comes from
        your Hadoop Administrator. This path must be a valid HDFS path.

    filename : str
        File name of the package to remove from the Dato Distributed
        installation.

    hadoop_conf_dir : str, optional
        Directory where the Hadoop configuration files are stored. If not
        given, the configuration files are searched for automatically in your
        CLASSPATH. hadoop_conf_dir must be a local file path.

    '''
    hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_dato_distr_param(dato_dist_path, hadoop_conf_dir)

    full = dato_dist_path + HadoopCluster._DIST_USER_PKG + "/" + filename
    _file_util.remove_hdfs(full, hadoop_conf_dir=hadoop_conf_dir)
Example #3
def upload_packages(turi_dist_path,
                    filename_or_dir,
                    hadoop_conf_dir=None,
                    force=False):
    '''
    Upload a package to the available packages for this Hadoop Turi Distributed
    installation.  Files must be valid PyPI packages.  You may download packages
    from PyPI with the commands

        >>> mkdir <directory_name>
        >>> pip install --download <directory_name> <package-name>

        then

        >>> graphlab.deploy.hadoop_cluster.upload_packages(<turi_dist_path>, <path_to_directory>)

    These packages will be available for future work on the cluster.

    Parameters
    -----------
    turi_dist_path : str
        The location where Turi Distributed is installed. This usually comes from
        your Hadoop Administrator. This path must be a valid HDFS path.

    filename_or_dir : str
        A file, or a directory containing files, to upload. The file(s) must be
        valid packages for your target host's operating system in your Hadoop
        setup.

    hadoop_conf_dir : str, optional
        Directory where the Hadoop configuration files are stored. If not
        given, the configuration files are searched for automatically in your
        CLASSPATH. hadoop_conf_dir must be a local file path.

    force : bool, optional
        Whether to overwrite the file if it already exists on HDFS.

    '''
    hadoop_conf_dir = _file_util.expand_full_path(
        hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_turi_distr_param(turi_dist_path, hadoop_conf_dir)

    dest = turi_dist_path + HadoopCluster._DIST_USER_PKG
    if _os.path.isdir(filename_or_dir):
        for root, directories, filenames in _os.walk(filename_or_dir):
            for f in filenames:
                full = _os.path.join(root, f)
                _file_util.upload_to_hdfs(full,
                                          dest,
                                          hadoop_conf_dir=hadoop_conf_dir,
                                          force=force)
    else:
        _file_util.upload_to_hdfs(filename_or_dir,
                                  dest,
                                  hadoop_conf_dir=hadoop_conf_dir,
                                  force=force)
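A hedged usage sketch of the workflow described in the docstring; the local directory and HDFS path below are placeholders:

# Download a PyPI package into a local directory (see the pip commands above),
# then push everything in that directory to the cluster's package area.
upload_packages('hdfs://namenode:8020/turi-dist', './pkgs',
                hadoop_conf_dir='/etc/hadoop/conf',
                force=True)   # overwrite files that already exist on HDFS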
Example #4
def show_available_packages(turi_dist_path, hadoop_conf_dir=None):
    '''
    Show all available packages in the Hadoop Turi Distributed installation.

    Parameters
    -----------
    turi_dist_path : str
        The location where Turi Distributed is installed. This usually comes from
        your Hadoop Administrator. This path must be a valid HDFS path.

    hadoop_conf_dir : str, optional
        Directory where the Hadoop configuration files are stored. If not
        given, the configuration files are searched for automatically in your
        CLASSPATH. hadoop_conf_dir must be a local file path.

    Returns
    -------
    out : dict
        Dict of two lists. default_packages lists the packages installed by
        default, in the format:

            "rsa==3.1.4",
            "scikit-learn==0.16.1",
            "scipy==0.15.1"

        user_packages lists additional PyPI packages that have been uploaded to
        the Turi Distributed installation, in the format:

            "names-0.3.0.tar.gz",
            "boto-2.33.0-py2.py3-none-any.whl",
            ...

    '''
    hadoop_conf_dir = _file_util.expand_full_path(
        hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_turi_distr_param(turi_dist_path, hadoop_conf_dir)

    conda_list = turi_dist_path + HadoopCluster._DIST_CONDA_LIST
    user_list = turi_dist_path + HadoopCluster._DIST_USER_PKG
    packages = _file_util.read_file_to_string_hdfs(
        conda_list, hadoop_conf_dir=hadoop_conf_dir)
    if packages is None:
        raise RuntimeError(
            "It seems like you do not have a valid Turi Distributed"
            " installation. Please contact your Hadoop administrator.")

    lines = packages.split(_os.linesep)
    output_lines = []
    for line in lines:
        # Each line of the conda list output looks like: <name> <version> <build>
        fields = line.split()
        if len(fields) == 3:
            output_lines.append('%s==%s' % (fields[0], fields[1]))

    result = {'default_packages': output_lines}
    user_add = _file_util.list_hdfs(user_list, hadoop_conf_dir=hadoop_conf_dir)
    user = [_os.path.basename(x['path']) for x in user_add]
    result['user_packages'] = user
    return result
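For illustration, a small sketch of how the returned dict might be inspected; the HDFS path is a placeholder:

pkgs = show_available_packages('hdfs://namenode:8020/turi-dist',
                               hadoop_conf_dir='/etc/hadoop/conf')
print(pkgs['default_packages'][:3])  # e.g. ['rsa==3.1.4', 'scikit-learn==0.16.1', ...]
print(pkgs['user_packages'])         # uploaded files, e.g. ['names-0.3.0.tar.gz']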
def upload_packages(dato_dist_path,
                    filename_or_dir,
                    hadoop_conf_dir=None,
                    force=False):
    '''
    Upload a package to the available packages for this Hadoop Dato Distributed
    installation.  Files must be valid PyPI packages.  You may download packages
    from PyPI with the commands

        >>> mkdir <directory_name>
        >>> pip install --download <directory_name> <package-name>

        then

        >>> graphlab.deploy.hadoop_cluster.upload_packages(<dato_dist_path>, <path_to_directory>)

    These packages will be available for future work on the cluster.

    Parameters
    -----------
    dato_dist_path : str
        The location where Dato Distributed is installed. This usually comes from
        your Hadoop Administrator. This path must be a valid HDFS path.

    filename_or_dir : str
        A file, or a directory containing files, to upload. The file(s) must be
        valid packages for your target host's operating system in your Hadoop
        setup.

    hadoop_conf_dir : str, optional
        Directory where the Hadoop configuration files are stored. If not
        given, the configuration files are searched for automatically in your
        CLASSPATH. hadoop_conf_dir must be a local file path.

    force : bool, optional
        Whether to overwrite the file if it already exists on HDFS.

    '''
    hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_dato_distr_param(dato_dist_path, hadoop_conf_dir)

    dest = dato_dist_path + HadoopCluster._DIST_USER_PKG
    if _os.path.isdir(filename_or_dir):
        for root, directories, filenames in _os.walk(filename_or_dir):
            for f in filenames:
                full = _os.path.join(root, f)
                _file_util.upload_to_hdfs(full, dest,
                    hadoop_conf_dir=hadoop_conf_dir, force=force)
    else:
        _file_util.upload_to_hdfs(filename_or_dir, dest,
            hadoop_conf_dir=hadoop_conf_dir, force=force)
def show_available_packages(dato_dist_path, hadoop_conf_dir=None):
    '''
    Show all available packages in the Hadoop Dato Distributed installation.

    Parameters
    -----------
    dato_dist_path : str
        The location where Dato Distributed is installed. This usually comes from
        your Hadoop Administrator. This path must be a valid HDFS path.

    hadoop_conf_dir : str, optional
        Directory where the Hadoop configuration files are stored. If not
        given, the configuration files are searched for automatically in your
        CLASSPATH. hadoop_conf_dir must be a local file path.

    Returns
    -------
    out : dict
        Dict of two lists. default_packages lists the packages installed by
        default, in the format:

            "rsa==3.1.4",
            "scikit-learn==0.16.1",
            "scipy==0.15.1"

        user_packages lists additional PyPI packages that have been uploaded to
        the Dato Distributed installation, in the format:

            "names-0.3.0.tar.gz",
            "boto-2.33.0-py2.py3-none-any.whl",
            ...

    '''
    hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_dato_distr_param(dato_dist_path, hadoop_conf_dir)

    conda_list = dato_dist_path + HadoopCluster._DIST_CONDA_LIST
    user_list = dato_dist_path + HadoopCluster._DIST_USER_PKG
    packages = _file_util.read_file_to_string_hdfs(conda_list, hadoop_conf_dir=hadoop_conf_dir)
    if packages is None:
        raise RuntimeError("It seems like you do not have a valid Dato Distributed"
        " installation. Please contact your Hadoop administrator.")

    lines = packages.split(_os.linesep)
    output_lines = []
    for line in lines:
        # Each line of the conda list output looks like: <name> <version> <build>
        fields = line.split()
        if len(fields) == 3:
            output_lines.append('%s==%s' % (fields[0], fields[1]))

    result = {'default_packages': output_lines}
    user_add = _file_util.list_hdfs(user_list, hadoop_conf_dir=hadoop_conf_dir)
    user = [_os.path.basename(x['path']) for x in user_add]
    result['user_packages'] = user
    return result
    def _load_local(cls, path):
        path = _file_util.expand_full_path(path)
        if not _os.path.exists(path):
            raise RuntimeError("Path %s does not exist." % path)

        try:
            unpickler = _gl_pickle.GLUnpickler(path)
            # The schema version is stored ahead of the policy object.
            schema_version = unpickler.load()

            loaded_policy = unpickler.load()
            unpickler.close()

        except Exception as e:
            raise RuntimeError('Unable to load policy. Error: %s' % e)

        return loaded_policy
    def _load_local(cls, path):
        path = fu.expand_full_path(path)
        if not os.path.exists(path):
            raise RuntimeError("Path %s does not exist." % path)

        try:
            unpickler = _gl_pickle.GLUnpickler(path)
            po_schema_version = unpickler.load()
            required_files = unpickler.load()

            # Lay out the required files before loading the function.
            # Required files have been managed separately from the Predictive
            # Object since schema version 4 (jumped to 6 to be in line with the
            # PS version), so they no longer need to be deserialized as part of
            # the load.
            if po_schema_version <= 3:
                cls._deserialize_required_files(required_files)
            else:
                # Do not need to load the dependent files after schema version 3
                # because the loading is handled in higher level by Predictive
                # service
                pass
            po_obj = unpickler.load()
            unpickler.close()

        except Exception as e:
            import traceback
            trace = traceback.format_exc()
            err_msg = "Traceback\n %s\n" % trace
            err_msg += "Error type    : %s\n" % e.__class__.__name__
            err_msg += "Error message : %s\n" % str(e)

            raise RuntimeError('Unable to load predictive object. Error: %s' % err_msg)

        if po_schema_version > PREDICTIVE_OBJECT_SCHEMA_VERSION:
            raise RuntimeError("Your GraphLab Create only supports Predictive "
                "Objects with schema version up to '%s', the Predictive Object "
                "you are trying to load has schema version '%s'. Please upgrade your "
                "GraphLab Create version to most up-to-date one." % \
                (PREDICTIVE_OBJECT_SCHEMA_VERSION, po_schema_version))

        return po_obj
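For context, a minimal sketch of the save side these loaders imply, mirroring the simpler of the two loaders above (schema version first, then the object). The _save_local name and its arguments are assumptions, not the library's confirmed API; GLPickler is assumed to be the write-side counterpart of GLUnpickler.

    def _save_local(cls, policy, path, schema_version):
        # Assumed counterpart: write the schema version first, then the policy,
        # in the same order the unpickler reads them back in _load_local.
        path = _file_util.expand_full_path(path)
        pickler = _gl_pickle.GLPickler(path)
        pickler.dump(schema_version)
        pickler.dump(policy)
        pickler.close()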
def create(name,
           dato_dist_path,
           hadoop_conf_dir=None,
           num_containers=None,
           container_size=None,
           num_vcores=None,
           start_port=None,
           end_port=None,
           additional_packages=None):
    '''
    Create a Dato Distributed cluster in Hadoop.

    Parameters
    -----------

    name : str
        A name for the newly created cluster. This name is used in your local
        session so that you can easily load the cluster back later using:

            >>> import graphlab
            >>> graphlab.deploy.environments[<cluster-name>]

    dato_dist_path : str
        The location where Dato Distributed is installed. This usually comes from
        your Hadoop Administrator. This path must be a valid HDFS path.

    hadoop_conf_dir : str, optional
        Directory where the Hadoop configuration files are stored. If not
        given, the configuration files are searched for automatically in your
        CLASSPATH. hadoop_conf_dir must be a local file path.

    num_containers : int, optional
        The number of containers to use for this environment. If not given, the
        default value is decided by your Dato Distributed administrator when
        installing Dato Distributed.

    container_size : int, optional
        The memory in MB required for job execution. If not given, the default
        value is decided by your Dato Distributed administrator when installing
        Dato Distributed.

    num_vcores : int, optional
        The number of virtual cores to use, which must be at least two. If not
        given, the default value is decided by your Dato Distributed
        administrator when installing Dato Distributed.

    additional_packages : list [str], optional
        Additional packages you want to use in your Hadoop Execution Environment.
        Each package can be given in one of the following formats:

            <package-name>
            <package-name>=<package-version>
            <package-name>==<package-version>

        You may use either Conda or PyPI packages.

        Any package you list here must have been added by your Hadoop
        Administrator to the package list of the Dato Distributed installation.
        You may use the following command to list all available packages in the
        Dato Distributed installation:

            >>> import graphlab
            >>> hadoop_cluster.show_available_packages()


    Returns
    --------
    cluster : HadoopCluster
        A handle to the Hadoop execution environment for your distributed job
        execution.

    Examples
    ---------

    To create a Hadoop cluster execution environment using default configuration:

        >>> import graphlab
        >>> hadoop_cluster = graphlab.deploy.hadoop_cluster.create(
        ...         name = 'my-first-cluster',
        ...         dato_dist_path = 'hdfs://namenode:port/dato-dist-path'
        ...     )

    To create a Hadoop cluster execution environment with all explicit config:

        >>> import graphlab
        >>> hadoop_cluster = graphlab.deploy.hadoop_cluster.create(
        ...         name = 'my-first-cluster',
        ...         dato_dist_path = 'hdfs://namenode:port/dato-dist-path',
        ...         hadoop_conf_dir = '<path-to-hadoop-conf>',
        ...         num_containers = 10,
        ...         num_vcores = 8,
        ...         container_size = 8192,
        ...         additional_packages = ['nltk==3.0.3']
        ...     )

    '''

    hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_dato_distr_param(dato_dist_path, hadoop_conf_dir)

    if not isinstance(name, basestring):
        raise TypeError('Cluster name has to be a string.')

    if additional_packages is not None:
        if isinstance(additional_packages, basestring):
            additional_packages = [additional_packages]

        if not hasattr(additional_packages, '__iter__'):
            raise TypeError('"additional_packages" parameter has to be iterable.')

    # Now create a HadoopCluster object
    cluster = HadoopCluster(name, dato_dist_path,  hadoop_conf_dir,
        num_containers, container_size, num_vcores,
        additional_packages)

    # Save to local session and overwrite if exists
    if cluster._session.exists(cluster.name, HadoopCluster._typename):
        _log.warning('Overwriting existing Hadoop Cluster "%s" in local session.' % cluster.name)
        _gl.deploy.environments.delete(cluster, silent=True)
    _gl.deploy._default_session.register(cluster)
    cluster.save()

    return cluster
Example #10
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (which can be any model) was previously
    saved in the GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # Check if the location is a dir_archive, if not, use glunpickler to load
    # as pure python model

    # We need to fix this sometime, but here is the explanation of the stupid
    # check below:
    #
    # If the location is a http location, skip the check, and directly proceed
    # to load model as dir_archive. This is because
    # 1) exists() does not work with http protocol, and
    # 2) GLUnpickler does not support http
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == '':
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, 'dir_archive.ini'))
    else:
        model_path = location
        if protocol in ['http', 'https']:
            dir_archive_exists = True
        else:
            import posixpath
            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, 'dir_archive.ini'))

    if not dir_archive_exists:
        # Not a ToolkitError so try unpickling the model.
        unpickler = gl_pickle.GLUnpickler(location)

        # Get the version
        version = unpickler.load()

        # Load the class name.
        cls_name = unpickler.load()
        cls = _get_class_from_name(cls_name)

        # Load the object with the right version.
        model = cls._load_version(unpickler, version)

        unpickler.close()

        # Return the model
        return model
    else:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
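A short usage sketch of the two paths through this function; the file name and HDFS location are placeholders:

import graphlab as gl

# A local path: the dir_archive.ini check decides between a dir_archive load
# and a GLUnpickler-based pure-python load.
local_model = gl.load_model('my_model_file')

# A remote URL (placeholder): hdfs paths are checked for dir_archive.ini via
# posixpath; http/https URLs skip the check and load as a dir_archive directly.
remote_model = gl.load_model('hdfs://namenode:8020/models/my_model_file')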
Example #11
def create(name,
           turi_dist_path,
           hadoop_conf_dir=None,
           num_containers=None,
           container_size=None,
           num_vcores=None,
           start_port=None,
           end_port=None,
           additional_packages=None):
    '''
    Create a Turi Distributed cluster in Hadoop.

    Parameters
    -----------

    name : str
        A name for the newly created cluster. This name is used in your local
        session so that you can easily load the cluster back later using:

            >>> import graphlab
            >>> graphlab.deploy.environments[<cluster-name>]

    turi_dist_path : str
        The location where Turi Distributed is installed. This usually comes from
        your Hadoop Administrator. This path must be a valid HDFS path.

    hadoop_conf_dir : str, optional
        Directory where the Hadoop configuration files are stored. If not
        given, the configuration files are searched for automatically in your
        CLASSPATH. hadoop_conf_dir must be a local file path.

    num_containers : int, optional
        The number of containers to use for this environment. If not given, the
        default value is decided by your Turi Distributed administrator when
        installing Turi Distributed.

    container_size : int, optional
        The memory in MB required for job execution. If not given, the default
        value is decided by your Turi Distributed administrator when installing
        Turi Distributed.

    num_vcores : int, optional
        The number of virtual cores to use, which must be at least two. If not
        given, the default value is decided by your Turi Distributed
        administrator when installing Turi Distributed.

    additional_packages : list [str], optional
        Additional packages you want to use in your Hadoop Execution Environment.
        Each package can be given in one of the following formats:

            <package-name>
            <package-name>=<package-version>
            <package-name>==<package-version>

        You may use either Conda or PyPI packages.

        Any package you list here must have been added by your Hadoop
        Administrator to the package list of the Turi Distributed installation.
        You may use the following command to list all available packages in the
        Turi Distributed installation:

            >>> import graphlab
            >>> hadoop_cluster.show_available_packages()


    Returns
    --------
    cluster : HadoopCluster
        A handle to the Hadoop execution environment for your distributed job
        execution.

    Examples
    ---------

    To create a Hadoop cluster execution environment using default configuration:

        >>> import graphlab
        >>> hadoop_cluster = graphlab.deploy.hadoop_cluster.create(
        ...         name = 'my-first-cluster',
        ...         turi_dist_path = 'hdfs://namenode:port/turi-dist-path'
        ...     )

    To create a Hadoop cluster execution environment with all explicit config:

        >>> import graphlab
        >>> hadoop_cluster = graphlab.deploy.hadoop_cluster.create(
        ...         name = 'my-first-cluster',
        ...         turi_dist_path = 'hdfs://namenode:port/turi-dist-path',
        ...         hadoop_conf_dir = '<path-to-hadoop-conf>',
        ...         num_containers = 10,
        ...         num_vcores = 8,
        ...         container_size = 8192,
        ...         additional_packages = ['nltk==3.0.3']
        ...     )

    '''

    hadoop_conf_dir = _file_util.expand_full_path(
        hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_turi_distr_param(turi_dist_path, hadoop_conf_dir)

    if not isinstance(name, basestring):
        raise TypeError('Cluster name has to be a string.')

    if additional_packages is not None:
        if isinstance(additional_packages, basestring):
            additional_packages = [additional_packages]

        if not hasattr(additional_packages, '__iter__'):
            raise TypeError(
                '"additional_packages" parameter has to be iterable.')

    # Now create a HadoopCluster object
    cluster = HadoopCluster(name, turi_dist_path, hadoop_conf_dir,
                            num_containers, container_size, num_vcores,
                            additional_packages)

    # Save to local session and overwrite if exists
    if cluster._session.exists(cluster.name, HadoopCluster._typename):
        _log.warning(
            'Overwriting existing Hadoop Cluster "%s" in local session.' %
            cluster.name)
        _gl.deploy.environments.delete(cluster, silent=True)
    _gl.deploy._default_session.register(cluster)
    cluster.save()

    return cluster
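Putting the functions in this module together, a hedged end-to-end sketch; the cluster name, HDFS path, local package path, and package version are placeholders:

import graphlab

TURI_DIST = 'hdfs://namenode:8020/turi-dist'  # placeholder HDFS path

# Inspect what the administrator installed, upload an extra package, then
# create a cluster that requests it.
graphlab.deploy.hadoop_cluster.show_available_packages(TURI_DIST)
graphlab.deploy.hadoop_cluster.upload_packages(TURI_DIST, './pkgs/nltk-3.0.3.tar.gz')
cluster = graphlab.deploy.hadoop_cluster.create(
    name='my-first-cluster',
    turi_dist_path=TURI_DIST,
    additional_packages=['nltk==3.0.3'])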