def remove_package(turi_dist_path, filename, hadoop_conf_dir=None):
    '''
    Remove a package from the available packages for this Hadoop Turi
    Distributed installation.  Once removed, the package can no longer be
    installed on the cluster.

    Parameters
    -----------
    turi_dist_path : str
        The location where Turi Distributed is installed. This usually comes
        from your Hadoop Administrator. This path must be a valid HDFS path.

    filename : str
        File name of the package to remove from the Turi Distributed
        installation.

    hadoop_conf_dir : str, optional
        Hadoop configure directory where Hadoop configuration files are
        stored. If not given, the configuration file is automatically
        searched in your CLASSPATH. hadoop_conf_dir must be a local file
        path.
    '''
    # Normalize the local conf path first; a falsy value stays None.
    if hadoop_conf_dir:
        hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir)
    else:
        hadoop_conf_dir = None
    _validate_turi_distr_param(turi_dist_path, hadoop_conf_dir)

    # User packages live under the installation's user-package directory.
    pkg_path = turi_dist_path + HadoopCluster._DIST_USER_PKG + "/" + filename
    _file_util.remove_hdfs(pkg_path, hadoop_conf_dir=hadoop_conf_dir)
def remove_package(dato_dist_path, filename, hadoop_conf_dir = None):
    '''
    Remove a package from the available packages for this Hadoop Dato
    Distributed installation.  Once removed, the package can no longer be
    installed on the cluster.

    Parameters
    -----------
    dato_dist_path : str
        The location where Dato Distributed is installed. This usually comes
        from your Hadoop Administrator. This path must be a valid HDFS path.

    filename : str
        File name of the package to remove from the Dato Distributed
        installation.

    hadoop_conf_dir : str, optional
        Hadoop configure directory where Hadoop configuration files are
        stored. If not given, the configuration file is automatically
        searched in your CLASSPATH. hadoop_conf_dir must be a local file
        path.
    '''
    # Normalize the local conf path first; a falsy value stays None.
    if hadoop_conf_dir:
        hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir)
    else:
        hadoop_conf_dir = None
    _validate_dato_distr_param(dato_dist_path, hadoop_conf_dir)

    # User packages live under the installation's user-package directory.
    pkg_path = dato_dist_path + HadoopCluster._DIST_USER_PKG + "/" + filename
    _file_util.remove_hdfs(pkg_path, hadoop_conf_dir=hadoop_conf_dir)
def upload_packages(turi_dist_path, filename_or_dir, hadoop_conf_dir=None,
                    force=False):
    '''
    Upload a package to the available packages for this Hadoop Turi
    Distributed installation.  Files must be a valid PyPI package.

    You may download packages from PyPI with the commands:

        >>> mkdir <directory_name>
        >>> pip install --download <directory_name> <package-name>

    then

        >>> graphlab.deploy.hadoop_cluster.upload_packages(<turi_dist_path>, <path_to_directory>)

    These packages will be available for future work on the cluster.

    Parameters
    -----------
    turi_dist_path : str
        The location where Turi Distributed is installed. This usually comes
        from your Hadoop Administrator. This path must be a valid HDFS path.

    filename_or_dir : str
        A file or directory containing files to upload, the file(s) must be a
        correct package for your target host's operating system in your
        Hadoop setup.

    hadoop_conf_dir : str, optional
        Hadoop configure directory where Hadoop configuration files are
        stored. If not given, the configuration file is automatically
        searched in your CLASSPATH. hadoop_conf_dir must be a local file
        path.

    force : boolean, optional
        Whether to force overwrite if the file already exists.
    '''
    if hadoop_conf_dir:
        hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir)
    else:
        hadoop_conf_dir = None
    _validate_turi_distr_param(turi_dist_path, hadoop_conf_dir)

    dest = turi_dist_path + HadoopCluster._DIST_USER_PKG

    # Collect everything to ship: either the single file given, or every
    # file found by recursively walking the given directory.
    if _os.path.isdir(filename_or_dir):
        sources = [_os.path.join(root, name)
                   for root, _dirs, names in _os.walk(filename_or_dir)
                   for name in names]
    else:
        sources = [filename_or_dir]

    for src in sources:
        _file_util.upload_to_hdfs(src, dest,
                                  hadoop_conf_dir=hadoop_conf_dir,
                                  force=force)
def show_available_packages(turi_dist_path, hadoop_conf_dir=None):
    '''
    Show all available packages in a Hadoop Turi Distributed installation.

    Parameters
    -----------
    turi_dist_path : str
        The location where Turi Distributed is installed. This usually comes
        from your Hadoop Administrator. This path must be a valid HDFS path.

    hadoop_conf_dir : str, optional
        Hadoop configure directory where Hadoop configuration files are
        stored. If not given, the configuration file is automatically
        searched in your CLASSPATH. hadoop_conf_dir must be a local file
        path.

    Returns
    -------
    out : dict
        Dict of two lists: default_packages in the format "rsa==3.1.4",
        "scikit-learn==0.16.1", "scipy==0.15.1", and user_packages,
        additional PyPI packages which have been uploaded to the Turi
        Distributed installation. user_packages has the format
        "names-0.3.0.tar.gz", "boto-2.33.0-py2.py3-none-any.whl", ...

    Raises
    ------
    RuntimeError
        If the conda package list cannot be read from the installation.
    '''
    hadoop_conf_dir = _file_util.expand_full_path(
        hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_turi_distr_param(turi_dist_path, hadoop_conf_dir)

    conda_list = turi_dist_path + HadoopCluster._DIST_CONDA_LIST
    user_list = turi_dist_path + HadoopCluster._DIST_USER_PKG
    packages = _file_util.read_file_to_string_hdfs(
        conda_list, hadoop_conf_dir=hadoop_conf_dir)
    if packages is None:
        raise RuntimeError(
            "It seems like you do not have a valid Turi Distributed"
            " installation. Please contact your Hadoop administrator.")

    # BUG FIX: previously split on _os.linesep, which is '\r\n' on Windows
    # while the file stored in HDFS is '\n'-delimited regardless of the
    # client OS -- the split silently failed there.  splitlines() handles
    # every newline convention.
    output_lines = []
    for line in packages.splitlines():
        parts = line.split()
        # A valid `conda list` row is "<name> <version> <build>".
        if len(parts) == 3:
            output_lines.append('%s==%s' % (parts[0], parts[1]))

    result = {'default_packages': output_lines}
    user_add = _file_util.list_hdfs(user_list, hadoop_conf_dir=hadoop_conf_dir)
    result['user_packages'] = [_os.path.basename(x['path']) for x in user_add]
    return result
def upload_packages( dato_dist_path, filename_or_dir, hadoop_conf_dir = None,
                     force=False):
    '''
    Upload a package to the available packages for this Hadoop Dato
    Distributed installation.  Files must be a valid PyPI package.

    You may download packages from PyPI with the commands:

        >>> mkdir <directory_name>
        >>> pip install --download <directory_name> <package-name>

    then

        >>> graphlab.deploy.hadoop_cluster.upload_packages(<dato_dist_path>, <path_to_directory>)

    These packages will be available for future work on the cluster.

    Parameters
    -----------
    dato_dist_path : str
        The location where Dato Distributed is installed. This usually comes
        from your Hadoop Administrator. This path must be a valid HDFS path.

    filename_or_dir : str
        A file or directory containing files to upload, the file(s) must be a
        correct package for your target host's operating system in your
        Hadoop setup.

    hadoop_conf_dir : str, optional
        Hadoop configure directory where Hadoop configuration files are
        stored. If not given, the configuration file is automatically
        searched in your CLASSPATH. hadoop_conf_dir must be a local file
        path.

    force : boolean, optional
        Whether to force overwrite if the file already exists.
    '''
    if hadoop_conf_dir:
        hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir)
    else:
        hadoop_conf_dir = None
    _validate_dato_distr_param(dato_dist_path, hadoop_conf_dir)

    dest = dato_dist_path + HadoopCluster._DIST_USER_PKG

    # Collect everything to ship: either the single file given, or every
    # file found by recursively walking the given directory.
    if _os.path.isdir(filename_or_dir):
        sources = [_os.path.join(root, name)
                   for root, _dirs, names in _os.walk(filename_or_dir)
                   for name in names]
    else:
        sources = [filename_or_dir]

    for src in sources:
        _file_util.upload_to_hdfs(src, dest,
                                  hadoop_conf_dir=hadoop_conf_dir,
                                  force=force)
def show_available_packages(dato_dist_path, hadoop_conf_dir = None):
    '''
    Show all available packages in a Hadoop Dato Distributed installation.

    Parameters
    -----------
    dato_dist_path : str
        The location where Dato Distributed is installed. This usually comes
        from your Hadoop Administrator. This path must be a valid HDFS path.

    hadoop_conf_dir : str, optional
        Hadoop configure directory where Hadoop configuration files are
        stored. If not given, the configuration file is automatically
        searched in your CLASSPATH. hadoop_conf_dir must be a local file
        path.

    Returns
    -------
    out : dict
        Dict of two lists: default_packages in the format "rsa==3.1.4",
        "scikit-learn==0.16.1", "scipy==0.15.1", and user_packages,
        additional PyPI packages which have been uploaded to the Dato
        Distributed installation. user_packages has the format
        "names-0.3.0.tar.gz", "boto-2.33.0-py2.py3-none-any.whl", ...

    Raises
    ------
    RuntimeError
        If the conda package list cannot be read from the installation.
    '''
    hadoop_conf_dir = _file_util.expand_full_path(
        hadoop_conf_dir) if hadoop_conf_dir else None
    _validate_dato_distr_param(dato_dist_path, hadoop_conf_dir)

    conda_list = dato_dist_path + HadoopCluster._DIST_CONDA_LIST
    user_list = dato_dist_path + HadoopCluster._DIST_USER_PKG
    packages = _file_util.read_file_to_string_hdfs(
        conda_list, hadoop_conf_dir=hadoop_conf_dir)
    if packages is None:
        raise RuntimeError("It seems like you do not have a valid Dato Distributed"
            " installation. Please contact your Hadoop administrator.")

    # BUG FIX: previously split on _os.linesep, which is '\r\n' on Windows
    # while the file stored in HDFS is '\n'-delimited regardless of the
    # client OS -- the split silently failed there.  splitlines() handles
    # every newline convention.
    output_lines = []
    for line in packages.splitlines():
        parts = line.split()
        # A valid `conda list` row is "<name> <version> <build>".
        if len(parts) == 3:
            output_lines.append('%s==%s' % (parts[0], parts[1]))

    result = {'default_packages': output_lines}
    user_add = _file_util.list_hdfs(user_list, hadoop_conf_dir=hadoop_conf_dir)
    result['user_packages'] = [_os.path.basename(x['path']) for x in user_add]
    return result
def _load_local(cls, path):
    """Load a policy object previously pickled to *path* on local disk."""
    # Resolve to an absolute local path before touching the filesystem.
    path = _file_util.expand_full_path(path)
    if not _os.path.exists(path):
        raise RuntimeError("Path %s does not exist." % path)

    try:
        reader = _gl_pickle.GLUnpickler(path)
        # The stream stores the schema version first, then the policy
        # itself; the version is read to advance the stream but is not
        # checked here.
        _schema_version = reader.load()
        policy = reader.load()
        reader.close()
    except Exception as e:
        raise RuntimeError('Unable to load policy. Error: %s' % e)
    return policy
def _load_local(cls, path):
    """Load a Predictive Object previously pickled to *path* on local disk."""
    path = fu.expand_full_path(path)
    if not os.path.exists(path):
        raise RuntimeError("Path %s does not exist." % path)

    try:
        reader = _gl_pickle.GLUnpickler(path)
        po_schema_version = reader.load()
        required_files = reader.load()

        # Lay out the required files before loading the function.
        # Since schema version 4 (bumped straight to 6 to stay in line with
        # the Predictive Service version) required files are managed
        # separately from the Predictive Object, so deserializing them here
        # is only needed for the old (<= 3) layouts; newer versions are
        # handled at a higher level by the Predictive Service.
        if po_schema_version <= 3:
            cls._deserialize_required_files(required_files)

        po_obj = reader.load()
        reader.close()
    except Exception as e:
        import traceback
        detail = "Traceback\n %s\n" % traceback.format_exc()
        detail += "Error type : %s\n" % e.__class__.__name__
        detail += "Error message : %s\n" % str(e)
        raise RuntimeError('Unable to load predictive object. Error: %s' % detail)

    # Refuse to load objects written by a newer GraphLab Create.
    if po_schema_version > PREDICTIVE_OBJECT_SCHEMA_VERSION:
        raise RuntimeError("Your GraphLab Create only supports Predictive "
            "Objects with schema version up to '%s', the Predictive Object "
            "you are trying to load has schema version '%s'. Please upgrade your "
            "GraphLab Create version to most up-to-date one." % \
            (PREDICTIVE_OBJECT_SCHEMA_VERSION, po_schema_version))

    return po_obj
def create( name, dato_dist_path, hadoop_conf_dir = None,
            num_containers = None, container_size = None, num_vcores = None,
            start_port = None, end_port = None, additional_packages = None):
    '''
    Create a Dato Distributed cluster in Hadoop.

    Parameters
    -----------
    name : str
        A name for newly created cluster, this name is used in your local
        session so that you may easily load it back using:

        >>> import graphlab
        >>> graphlab.deploy.environments[<cluster-name>]

    dato_dist_path : str
        The location where Dato Distributed is installed. This usually comes
        from your Hadoop Administrator. This path must be a valid HDFS path.

    hadoop_conf_dir : str, optional
        Hadoop configure directory where Hadoop configuration files are
        stored. If not given, the configuration file is automatically
        searched in your CLASSPATH. hadoop_conf_dir must be a local file
        path.

    num_containers : int, optional
        The number of containers to use for this environment. If not given,
        default value is decided by your Dato Distributed administrator when
        installing Dato Distributed.

    container_size : int, optional
        The memory in MB required for job execution. If not given, default
        value is decided by your Dato Distributed administrator when
        installing Dato Distributed.

    num_vcores : int, optional
        The number of virtual cores to use, must be at least two. If not
        given, default value is decided by your Dato Distributed
        administrator when installing Dato Distributed.

    additional_packages : list [str], optional
        Additional packages you want to use in your Hadoop Execution
        Environment. The package can be in one of the following formats:

            <package-name>
            <package-name>=<package-version>
            <package-name>==<package-version>

        You may use either Conda package or PyPI package. Any package you
        listed here must have been added to the package list by your Hadoop
        Administrator to the Dato Distributed installation.

        You may use the following command to list all available packages in
        the Dato Distributed installation:

        >>> import graphlab
        >>> hadoop_cluster.show_available_packages()

    Returns
    --------
    cluster | a HadoopCluster object
        A handle to the Hadoop execution environment for your distributed
        job execution

    Examples
    ---------
    To create a Hadoop cluster execution environment using default
    configuration:

    >>> import graphlab
    >>> hadoop_cluster = graphlab.deploy.hadoop_cluster.create(
    ...     name = 'my-first-cluster',
    ...     dato_dist_path = 'hdfs://namenode:port/dato-dist-path')

    To create a Hadoop cluster execution environment with all explicit
    config:

    >>> import graphlab
    >>> hadoop_cluster = graphlab.deploy.hadoop_cluster.create(
    ...     name = 'my-first-cluster',
    ...     dato_dist_path = 'hdfs://namenode:port/dato-dist-path',
    ...     hadoop_conf_dir = '<path-to-hadoop-conf>',
    ...     num_containers = 10,
    ...     num_vcores = 8,
    ...     container_size = 8192,
    ...     additional_packages = ['nltk==3.0.3'])
    '''
    hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir) \
        if hadoop_conf_dir else None
    _validate_dato_distr_param(dato_dist_path, hadoop_conf_dir)

    if not isinstance(name, basestring):
        raise TypeError('Cluster name has to be a string.')

    if additional_packages is not None:
        # Allow a single package to be passed as a bare string.
        if isinstance(additional_packages, basestring):
            additional_packages = [additional_packages]
        if not hasattr(additional_packages, '__iter__'):
            raise TypeError('"additional_packages" parameter has to be iterable.')

    # Now create a HadoopCluster object
    cluster = HadoopCluster(name, dato_dist_path, hadoop_conf_dir,
                            num_containers, container_size, num_vcores,
                            additional_packages)

    # Save to local session and overwrite if exists
    if cluster._session.exists(cluster.name, HadoopCluster._typename):
        # BUG FIX: the log message read "Overwritting" (typo); the Turi
        # variant of this function already spells it correctly.
        _log.warning('Overwriting existing Hadoop Cluster "%s" in local session.' % cluster.name)
        _gl.deploy.environments.delete(cluster, silent=True)
    _gl.deploy._default_session.register(cluster)
    cluster.save()
    return cluster
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved
    in GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    ----------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # Decide whether `location` holds a dir_archive (native model) or a
    # plain pickled python model.  The probe is protocol dependent:
    #   * local paths          -> look for dir_archive.ini on disk
    #   * http(s)              -> assume dir_archive, because exists() does
    #                             not work over http and GLUnpickler cannot
    #                             read http either
    #   * other remote schemes -> look for dir_archive.ini remotely
    protocol = file_util.get_protocol(location)
    if protocol == '':
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, 'dir_archive.ini'))
    elif protocol in ['http', 'https']:
        dir_archive_exists = True
    else:
        import posixpath
        dir_archive_exists = file_util.exists(
            posixpath.join(location, 'dir_archive.ini'))

    if dir_archive_exists:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)

    # Not a dir_archive, so try unpickling the model instead.
    unpickler = gl_pickle.GLUnpickler(location)
    version = unpickler.load()      # version the model was saved with
    cls_name = unpickler.load()     # class name of the model
    cls = _get_class_from_name(cls_name)
    # Let the class restore itself with the matching version.
    model = cls._load_version(unpickler, version)
    unpickler.close()
    return model
def create(name, turi_dist_path, hadoop_conf_dir=None, num_containers=None,
           container_size=None, num_vcores=None, start_port=None,
           end_port=None, additional_packages=None):
    '''
    Create a Turi Distributed cluster in Hadoop.

    Parameters
    -----------
    name : str
        A name for newly created cluster, this name is used in your local
        session so that you may easily load it back using:

        >>> import graphlab
        >>> graphlab.deploy.environments[<cluster-name>]

    turi_dist_path : str
        The location where Turi Distributed is installed. This usually comes
        from your Hadoop Administrator. This path must be a valid HDFS path.

    hadoop_conf_dir : str, optional
        Hadoop configure directory where Hadoop configuration files are
        stored. If not given, the configuration file is automatically
        searched in your CLASSPATH. hadoop_conf_dir must be a local file
        path.

    num_containers : int, optional
        The number of containers to use for this environment. If not given,
        default value is decided by your Turi Distributed administrator when
        installing Turi Distributed.

    container_size : int, optional
        The memory in MB required for job execution. If not given, default
        value is decided by your Turi Distributed administrator when
        installing Turi Distributed.

    num_vcores : int, optional
        The number of virtual cores to use, must be at least two. If not
        given, default value is decided by your Turi Distributed
        administrator when installing Turi Distributed.

    additional_packages : list [str], optional
        Additional packages you want to use in your Hadoop Execution
        Environment. The package can be in one of the following formats:

            <package-name>
            <package-name>=<package-version>
            <package-name>==<package-version>

        You may use either Conda package or PyPI package. Any package you
        listed here must have been added to the package list by your Hadoop
        Administrator to the Turi Distributed installation.

        You may use the following command to list all available packages in
        the Turi Distributed installation:

        >>> import graphlab
        >>> hadoop_cluster.show_available_packages()

    Returns
    --------
    cluster | a HadoopCluster object
        A handle to the Hadoop execution environment for your distributed
        job execution

    Examples
    ---------
    To create a Hadoop cluster execution environment using default
    configuration:

    >>> import graphlab
    >>> hadoop_cluster = graphlab.deploy.hadoop_cluster.create(
    ...     name = 'my-first-cluster',
    ...     turi_dist_path = 'hdfs://namenode:port/turi-dist-path')

    To create a Hadoop cluster execution environment with all explicit
    config:

    >>> import graphlab
    >>> hadoop_cluster = graphlab.deploy.hadoop_cluster.create(
    ...     name = 'my-first-cluster',
    ...     turi_dist_path = 'hdfs://namenode:port/turi-dist-path',
    ...     hadoop_conf_dir = '<path-to-hadoop-conf>',
    ...     num_containers = 10,
    ...     num_vcores = 8,
    ...     container_size = 8192,
    ...     additional_packages = ['nltk==3.0.3'])
    '''
    if hadoop_conf_dir:
        hadoop_conf_dir = _file_util.expand_full_path(hadoop_conf_dir)
    else:
        hadoop_conf_dir = None
    _validate_turi_distr_param(turi_dist_path, hadoop_conf_dir)

    if not isinstance(name, basestring):
        raise TypeError('Cluster name has to be a string.')

    if additional_packages is not None:
        # Allow a single package to be passed as a bare string.
        if isinstance(additional_packages, basestring):
            additional_packages = [additional_packages]
        if not hasattr(additional_packages, '__iter__'):
            raise TypeError(
                '"additional_packages" parameter has to be iterable.')

    # Now create a HadoopCluster object
    cluster = HadoopCluster(name, turi_dist_path, hadoop_conf_dir,
                            num_containers, container_size, num_vcores,
                            additional_packages)

    # Save to local session and overwrite if exists
    if cluster._session.exists(cluster.name, HadoopCluster._typename):
        _log.warning(
            'Overwriting existing Hadoop Cluster "%s" in local session.'
            % cluster.name)
        _gl.deploy.environments.delete(cluster, silent=True)
    _gl.deploy._default_session.register(cluster)
    cluster.save()
    return cluster