def init(repo_path: Text, pipelines_dir: Text = None,
         analytics_opt_in: bool = None):
    """Initialize a ZenML repository at the given path."""
    # Fall back to the current working directory when no path is given.
    repo_path = os.getcwd() if repo_path is None else repo_path

    # Only ask interactively when the caller did not decide beforehand.
    if analytics_opt_in is None:
        opt_in_prompt = (
            "ZenML collects anonymized usage information. This data helps us "
            "create a better product and understand the needs of the "
            "community better. You can find more information about exactly "
            "why, what and how we collect usage analytics statistics at: "
            "https://docs.zenml.io/misc/usage-analytics. "
            "Would you like to opt-in to usage analytics?"
        )
        analytics_opt_in = confirmation(opt_in_prompt)

    try:
        # The two `None` arguments are passed positionally, exactly as the
        # repository initializer expects them.
        Repository.init_repo(
            repo_path,
            None,
            None,
            pipelines_dir,
            analytics_opt_in,
        )
    except git.InvalidGitRepositoryError:
        click.echo(f'{repo_path} is not a valid git repository! Please '
                   f'initialize ZenML within a git repository.')
    else:
        click.echo(f'ZenML repo initialized at {repo_path}')
def register_pipeline(self, config: Dict[Text, Any]):
    """
    Register this pipeline's config with the repository.

    The config is handed to the repository singleton under
    `self.file_name`.

    Args:
        config: dict representation of ZenML config.
    """
    repository = Repository.get_instance()
    repository.register_pipeline(file_name=self.file_name, config=config)
def _check_registered(self):
    """
    Ensure no pipeline file named `self.file_name` is registered yet.

    Raises:
        AssertionError: if the repository already lists `self.file_name`
            among its pipeline file names.
    """
    registered_files = Repository.get_instance().get_pipeline_file_paths(
        only_file_names=True)
    if self.file_name not in registered_files:
        return

    raise AssertionError(
        f'Pipeline names must be unique in the repository. There '
        f'is already a pipeline called {self.name}')
def __init__(self, name: Text, schema: Dict = None, _id: Text = None,
             *args, **kwargs):
    """
    Create a new datasource, or rebuild one that already has an ID.

    Args:
        name (str): name of datasource
        schema (dict): schema of datasource
        _id: unique ID (for internal use). A truthy value means the
            datasource was loaded from config, so no new ID is generated
            and no duplicate-name check is made.

    Raises:
        AlreadyExistsException: if no `_id` is given and the repository
            already reports a datasource with this name.
    """
    if not _id:
        # No ID means the datasource is 'new': the name must be unused.
        for existing_name in \
                Repository.get_instance().get_datasource_names():
            if existing_name == name:
                raise AlreadyExistsException(
                    name=name,
                    resource_type='datasource')
        self._id = str(uuid4())
        track(event=CREATE_DATASOURCE)
        logger.info(f'Datasource {name} created.')
    else:
        # Loaded from config: reuse the stored ID.
        self._id = _id
        logger.debug(f'Datasource {name} loaded.')

    self.name = name
    self.schema = schema
    self._immutable = False

    # Fully qualified class path of the concrete datasource class.
    class_path = self.__class__.__module__ + '.' + self.__class__.__name__
    self._source = source_utils.resolve_source_path(class_path)
def run(self, config: Dict[Text, Any]):
    """
    Tar the repository, stage the tar in the artifact store and launch
    the job.

    Args:
        config: a ZenML config dict. The artifact store path is read from
            `keys.GlobalKeys.ARTIFACT_STORE`, and the staged tar path is
            written in place into the backend args under `TAR_PATH_ARG`.
    """
    logger.info('Orchestrating pipeline on Kubernetes..')

    # Locations needed to build the tar.
    repository: Repository = Repository.get_instance()
    source_dir = repository.path
    zenml_config_dir = repository.zenml_config.config_dir

    # The archive name carries the current Unix time in whole seconds.
    timestamp = str(int(time.time()))
    archive_name = f'{EXTRACTED_TAR_DIR_NAME}_{timestamp}.tar.gz'
    local_archive = os.path.join(zenml_config_dir, archive_name)

    # NOTE(review): the original comment says the .zenml folder is excluded
    # from the tar if it exists - that is up to create_tarfile; confirm.
    path_utils.create_tarfile(source_dir, local_archive)
    logger.info(f'Created tar of current repository at: {local_archive}')

    # Stage the archive inside the artifact store.
    staging_dir = os.path.join(config[keys.GlobalKeys.ARTIFACT_STORE],
                               STAGING_AREA)
    staged_archive = os.path.join(staging_dir, archive_name)
    path_utils.copy(local_archive, staged_archive)
    logger.info(f'Copied tar to artifact store at: {staged_archive}')

    # The local copy is no longer needed once staged.
    path_utils.rm_dir(local_archive)
    logger.info(f'Removed tar at: {local_archive}')

    # Record the staged tar path in the backend args of the config.
    backend_args = config[keys.GlobalKeys.BACKEND][keys.BackendKeys.ARGS]
    backend_args[TAR_PATH_ARG] = staged_archive

    self.launch_job(config)
def get_datasource_by_name(repo: Repository, datasource_name: Text):
    """
    Gets a datasource from the current repository by matching a name
    identifier against the datasource name, and pretty-prints it.
    """
    datasource = repo.get_datasource_by_name(datasource_name)
    pretty_print(datasource)
def _get_one_pipeline(self):
    """
    Return a representative pipeline among those associated with this
    datasource (the first one the repository reports).

    Raises:
        EmptyDatasourceException: if no pipeline is associated.
    """
    associated = Repository.get_instance().get_pipelines_by_datasource(self)
    if len(associated) != 0:
        return associated[0]
    raise EmptyDatasourceException
def wrapper():
    """
    Remove every entry listed in the repository's pipelines directory.

    Removal is best-effort: a failure is printed and the loop carries on
    with the next entry.
    """
    pipelines_dir = \
        Repository.get_instance().zenml_config.get_pipelines_dir()
    for config_path in path_utils.list_dir(pipelines_dir):
        try:
            os.remove(config_path)
        except Exception as err:
            print(err)
def wrapper():
    """
    Point the repository at `pipeline_root`, then load and run every
    pipeline config found there as a TrainingPipeline.
    """
    zenml_config = Repository.get_instance().zenml_config
    zenml_config.set_pipelines_dir(pipeline_root)

    for config_file in path_utils.list_dir(pipeline_root):
        pipeline_config = yaml_utils.read_yaml(config_file)
        pipeline: TrainingPipeline = \
            TrainingPipeline.from_config(pipeline_config)
        pipeline.run()
def list_config():
    """Print the current ZenML config to the command line"""
    # Only the repository lookup is guarded; a failure is reported through
    # `error` and nothing is printed.
    try:
        repository: Repository = Repository.get_instance()
    except Exception as exc:
        error(exc)
    else:
        click.echo(to_pretty_string(repository.zenml_config))
def _get_one_pipeline(self):
    """
    Return a representative pipeline among those associated with this
    datasource (the first one the repository reports).

    Raises:
        Exception: if no pipeline is associated with the datasource.
    """
    associated = Repository.get_instance().get_pipelines_by_datasource(self)
    if len(associated) != 0:
        return associated[0]
    raise Exception('This datasource is not associated with any '
                    'pipelines, therefore there is no data!')
def list_steps(repo: Repository):
    """Print a table with one row per (step name, step version) pair."""
    rows = []
    for step_name, version_set in repo.get_step_versions().items():
        # One row for each version recorded under this step name.
        rows.extend((step_name, version) for version in version_set)

    click.echo(tabulate(rows, headers=["step_name", "step_version"]))
def get_pipeline_by_name(repo: Repository, pipeline_name: Text):
    """
    Gets pipeline from current repository by matching a name against a
    pipeline name in the repository.
    """
    # Lookup failures are reported through `error`; printing only happens
    # when the lookup succeeded.
    try:
        pipeline = repo.get_pipeline_by_name(pipeline_name)
    except Exception as exc:
        error(exc)
    else:
        pretty_print(pipeline)
def load_source_path_class(source_path: Text) -> Type:
    """
    Loads a Python class from the path provided.

    Three cases are handled:

    * pinned to a non-standard pin (a git sha): the module directory is
      checked out at that sha, the class is imported, and the directory is
      then reset and checked out again without a sha;
    * pinned to a standard pin: the class is imported directly;
    * not pinned: the class is imported from the current repository state.

    Args:
        source_path (str): relative module path e.g. this.module.Class[@sha]

    Returns:
        The imported class.

    Raises:
        Exception: if the module to be checked out from git history has
            uncommitted changes. Any error raised while importing the
            class propagates unchanged, after the working tree has been
            restored.
    """
    source = source_path.split('@')[0]
    pin = source_path.split('@')[-1]
    is_standard = is_standard_pin(pin)

    if '@' in source_path and not is_standard:
        logger.debug('Pinned step found with git sha. '
                     'Loading class from git history.')
        wrapper: GitWrapper = Repository.get_instance().get_git_wrapper()

        module_path = get_module_path_from_source(source_path)
        relative_module_path = get_relative_path_from_module(module_path)

        logger.warning('Found source with a pinned sha. Will now checkout '
                       f'module: {module_path}')

        # critical step: refuse to check out over uncommitted changes
        if not wrapper.check_module_clean(source_path):
            raise Exception(f'One of the files at {relative_module_path} '
                            f'is not committed and we '
                            f'are trying to load that directory from git '
                            f'history due to a pinned step in the pipeline. '
                            f'Please commit the file and then run the '
                            f'pipeline.')

        # Check out the directory at that sha
        wrapper.checkout(sha_or_branch=pin, directory=relative_module_path)

        # From here on the checkout must always be undone, whether the
        # import succeeds or fails. `finally` guarantees that, and lets the
        # original import error (with its traceback) reach the caller
        # instead of being replaced by a bare `Exception`.
        try:
            class_ = import_class_by_path(source)
        finally:
            wrapper.reset(relative_module_path)
            wrapper.checkout(directory=relative_module_path)

    elif '@' in source_path and is_standard:
        logger.debug(f'Default {APP_NAME} class used. Loading directly.')
        # TODO: [LOW] Check if ZenML version is installed before loading.
        class_ = import_class_by_path(source)
    else:
        logger.debug('Unpinned step found with no git sha. Attempting to '
                     'load class from current repository state.')
        class_ = import_class_by_path(source)

    return class_
def run_pipeline(self, config_b64: str):
    """
    Decode a base64 JSON config, unpack the shipped repository tar and run
    the pipeline with the base orchestrator backend.

    Args:
        config_b64: base64-encoded JSON config. Its backend args must hold
            the tar location under `TAR_PATH_ARG`; that entry is removed
            before the pipeline runs.
    """
    decoded = base64.b64decode(config_b64)
    config = json.loads(decoded)

    # Take the tar path out of the backend args.
    backend_args = config[keys.GlobalKeys.BACKEND][keys.BackendKeys.ARGS]
    remote_tar = backend_args.pop(TAR_PATH_ARG)

    # The tar lives at a remote location, so fetch it before extracting.
    path_utils.copy(remote_tar, EXTRACTED_TAR_FILE_PATH)
    path_utils.extract_tarfile(EXTRACTED_TAR_FILE_PATH, EXTRACTED_TAR_DIR)

    # Make the extracted user code importable.
    sys.path.append(EXTRACTED_TAR_DIR)

    # Make sure the Repository is initialized at the extracted path.
    Repository.get_instance(EXTRACTED_TAR_DIR)

    # Run through the base backend, i.e. locally.
    local_backend = OrchestratorBaseBackend()
    local_backend.run(config)
def resolve_source_path(source_path: Text) -> Text:
    """
    Resolves source path with an optional sha using Git.

    Standard steps are resolved through `resolve_standard_source_path`;
    everything else is resolved by the repository's git wrapper.

    Args:
        source_path (str): relative module path e.g. this.module.Class

    Returns:
        The resolved source path.
    """
    if not is_standard_step(source_path):
        # Non-standard step: use Git resolution.
        git_wrapper: GitWrapper = \
            Repository.get_instance().get_git_wrapper()
        return git_wrapper.resolve_source_path(source_path)

    # Standard step: use the standard version.
    return resolve_standard_source_path(source_path)
def get_config(self):
    """
    Build the config dict for the Cortex serving step.

    Returns:
        A dict with a single key, "cortex_serving_args", mapping to the
        serving arguments. `predictor_path` is the predictor's source file
        path joined onto the repository path.
    """
    # Fully qualified class path of the predictor, e.g. module.ClassName.
    predictor_path = '.'.join(
        (self.predictor.__module__, self.predictor.__name__))
    class_path = get_class_path_from_source(predictor_path)
    predictor_file = get_path_from_source(class_path)

    repository: Repository = Repository.get_instance()

    serving_args = {
        "env": self.env,
        "api_config": self.api_config,
        "predictor_path": os.path.join(repository.path, predictor_file),
        "requirements": self.requirements,
        "conda_packages": self.conda_packages,
        "force": self.force,
        "wait": self.wait,
    }
    return {"cortex_serving_args": serving_args}
def list_pipelines(repo: Repository):
    """Lists pipelines in the current repository."""
    column_titles = ["name", "type", "cache enabled", "status", "file name"]
    try:
        # One table row per pipeline, in the same order as the titles.
        rows = [
            (pipeline.name,
             pipeline.PIPELINE_TYPE,
             pipeline.enable_cache,
             pipeline.get_status(),
             pipeline.file_name)
            for pipeline in repo.get_pipelines()
        ]
        click.echo(tabulate(rows, headers=column_titles))
    except Exception as exc:
        # Any failure while collecting or printing goes through `error`.
        error(exc)
def set_metadata_store(store_type, args):
    """Set metadata store for local config."""
    # Invalid options are echoed back to the user and nothing is changed.
    try:
        store_args = parse_unknown_options(args)
    except AssertionError as err:
        click.echo(str(err))
        return

    # Imported inside the function, as in the original layout.
    from zenml.core.metadata.metadata_wrapper import ZenMLMetadataStore

    # TODO: [LOW] Hard-coded
    metadata_store = ZenMLMetadataStore.from_config(
        {'type': store_type, 'args': store_args})

    Repository.get_instance().zenml_config.set_metadata_store(metadata_store)
    click.echo(f'Metadata store set to: {metadata_store.to_config()}')
def run(self, config: Dict[Text, Any]):
    """
    Tar the repository, stage the tar in the artifact store and launch
    the instance.

    This run function essentially calls an underlying TFX orchestrator run.
    However it is meant as a higher level abstraction with some
    opinionated decisions taken.

    Args:
        config: a ZenML config dict. The artifact store path is read from
            `keys.GlobalKeys.ARTIFACT_STORE`, and the staged tar path is
            written in place into the backend args under `TAR_PATH_ARG`.
    """
    logger.info('Orchestrating pipeline on GCP..')

    # Locations needed to build the tar.
    repository: Repository = Repository.get_instance()
    source_dir = repository.path
    zenml_config_dir = repository.zenml_config.config_dir

    # The archive name carries the current Unix time in whole seconds.
    timestamp = str(int(time.time()))
    archive_name = f'{EXTRACTED_TAR_DIR_NAME}_{timestamp}.tar.gz'
    local_archive = os.path.join(zenml_config_dir, archive_name)

    # NOTE(review): the original comment says the .zenml folder is excluded
    # from the tar if it exists - that is up to create_tarfile; confirm.
    path_utils.create_tarfile(source_dir, local_archive)
    logger.info(f'Created tar of current repository at: {local_archive}')

    # Stage the archive inside the artifact store.
    staging_dir = os.path.join(config[keys.GlobalKeys.ARTIFACT_STORE],
                               STAGING_AREA)
    staged_archive = os.path.join(staging_dir, archive_name)
    path_utils.copy(local_archive, staged_archive)
    logger.info(f'Copied tar to artifact store at: {staged_archive}')

    # The local copy is no longer needed once staged.
    path_utils.rm_dir(local_archive)
    logger.info(f'Removed tar at: {local_archive}')

    # Record the staged tar path in the backend args of the config.
    backend_args = config[keys.GlobalKeys.BACKEND][keys.BackendKeys.ARGS]
    backend_args[TAR_PATH_ARG] = staged_archive

    self.launch_instance(config)
def __init__(self, **params): super(Application, self).__init__(**params) # lists result_list = [] hparam_list = [] repo: Repository = Repository.get_instance() # get all pipelines in this workspace all_pipelines: List[TrainingPipeline] = repo.get_pipelines_by_type( [TrainingPipeline.PIPELINE_TYPE]) # get a dataframe of all results + all hyperparameter combinations for p in all_pipelines: # This is slowing the comparison down but # necessary to update the status of each run if p.get_status() == PipelineStatusTypes.Succeeded.name: eval_path = p.get_artifacts_uri_by_component( GDPComponent.Evaluator.name)[0] evaluation = tfma.load_eval_result(eval_path) for s, m in evaluation.slicing_metrics: result_list.append( dict([('pipeline_name', '{}'.format(p.name)), ('slice_name', s[0][0] if s else ''), ('slice_value', s[0][1] if s else '')])) result_list[-1].update( {f'metric_{k}': m[k][''] for k, v in m.items()}) h_dict = p.get_hyperparameters() h_dict['pipeline_name'] = p.name hparam_list.append(h_dict) self.results = pd.DataFrame([parse_metrics(r) for r in result_list]) self.hparam_info = pd.DataFrame(hparam_list) # set params self.param.pipeline_run_selector.objects = self.results[ 'pipeline_name'].unique()
import zenml import shutil from zenml.core.repo.repo import Repository from zenml.core.repo.zenml_config import ZenMLConfig, PIPELINES_DIR_KEY from zenml.utils.exceptions import InitializationException from zenml.utils import yaml_utils from zenml.core.standards import standard_keys as keys from zenml.core.repo.constants import ARTIFACT_STORE_DEFAULT_DIR, \ ZENML_DIR_NAME, ML_METADATA_SQLITE_DEFAULT_NAME from zenml.core.metadata.mock_metadata_wrapper import MockMetadataStore ZENML_ROOT = zenml.__path__[0] TEST_ROOT = os.path.join(ZENML_ROOT, "testing") pipelines_dir = os.path.join(TEST_ROOT, "test_pipelines") repo: Repository = Repository.get_instance() repo.zenml_config.set_pipelines_dir(pipelines_dir) config_root = os.path.dirname(ZENML_ROOT) artifact_store_path = os.path.join(config_root, ZENML_DIR_NAME, ARTIFACT_STORE_DEFAULT_DIR) sqlite_uri = os.path.join(artifact_store_path, ML_METADATA_SQLITE_DEFAULT_NAME) def test_zenml_config_init(): # in the root initialization should work _ = ZenMLConfig(config_root) # outside of an initialized repo path with pytest.raises(InitializationException):
def __init__(self,
             name: Text,
             enable_cache: Optional[bool] = True,
             steps_dict: Dict[Text, BaseStep] = None,
             backends_dict: Dict[Text, BaseBackend] = None,
             metadata_store: Optional[ZenMLMetadataStore] = None,
             artifact_store: Optional[ArtifactStore] = None,
             datasource: Optional[BaseDatasource] = None,
             pipeline_name: Optional[Text] = None,
             *args, **kwargs):
    """
    Construct a base pipeline.

    This is a base interface that is meant to be overridden in multiple
    other pipeline use cases.

    Args:
        name: Outward-facing name of the pipeline.
        enable_cache: Boolean, indicates whether or not caching should be
            used.
        steps_dict: Optional dict of steps. An empty dict is used if None.
        backends_dict: Optional dict of backends. If None, the result of
            `self.get_default_backends()` is used.
        metadata_store: Configured metadata store. If None, the default
            metadata store is used.
        artifact_store: Configured artifact store. If None, the default
            artifact store is used.
        datasource: Optional datasource of the pipeline.
        pipeline_name: A unique name that identifies the pipeline after
            it is run. When given, the pipeline is treated as loaded from
            an existing config; otherwise a new name is generated.

    Raises:
        AssertionError: if a new pipeline's file name already exists in
            the repository.
    """
    self.name = name

    # Metadata store: fall back to the repository default.
    self.metadata_store: ZenMLMetadataStore = (
        metadata_store if metadata_store
        else Repository.get_instance().get_default_metadata_store())

    if not pipeline_name:
        # No pipeline_name means this is a new pipeline.
        self._immutable = False
        self.pipeline_name = self.create_pipeline_name_from_name()
        self.file_name = self.pipeline_name + '.yaml'

        # A 'new' pipeline must not collide with a registered file.
        known_files = Repository.get_instance().get_pipeline_file_paths(
            only_file_names=True)
        if self.file_name in known_files:
            raise AssertionError(
                f'Pipeline names must be unique in the repository. There '
                f'is already a pipeline called {self.name}')
        track(event=CREATE_PIPELINE)
        logger.info(f'Pipeline {name} created.')
    else:
        # Loaded in through YAML, try to get context.
        # NOTE(review): is_executed_in_metadata_store is read before
        # self.pipeline_name is assigned below - confirm the property
        # does not rely on it.
        already_executed = self.is_executed_in_metadata_store
        if already_executed:
            self._immutable = True
            logger.debug(f'Pipeline {name} loaded and and is immutable.')
        else:
            # The metadata store does not have the pipeline_name, so it
            # can safely be executed again.
            self._immutable = False
            logger.debug(f'Pipeline {name} loaded and can be run.')
        self.pipeline_name = pipeline_name
        self.file_name = self.pipeline_name + '.yaml'

    self.enable_cache = enable_cache

    self.steps_dict: Dict[Text, BaseStep] = \
        {} if steps_dict is None else steps_dict

    # Backends
    self.backends_dict: Dict[Text, BaseBackend] = (
        self.get_default_backends() if backends_dict is None
        else backends_dict)

    # Artifact store: fall back to the repository default.
    self.artifact_store = (
        artifact_store if artifact_store
        else Repository.get_instance().get_default_artifact_store())

    # Datasource: any falsy value is stored as None.
    self.datasource = datasource if datasource else None
def load_config(self) -> Dict[Text, Any]:
    """
    Load this pipeline's config through the repository.

    Returns:
        The config dict the repository returns for `self.file_name`.
    """
    repository = Repository.get_instance()
    return repository.load_pipeline_config(file_name=self.file_name)
CORTEX_MODEL_NAME = os.getenv('CORTEX_MODEL_NAME', 'zenml-classifier') # For this example, the ArtifactStore must be a GCP bucket, as the # CortexDeployer step is using the GCP env. from zenml.core.repo.repo import Repository # Define the training pipeline training_pipeline = TrainingPipeline() # Add a datasource. This will automatically track and version it. try: ds = CSVDatasource(name='Pima Indians Diabetes', path='gs://zenml_quickstart/diabetes.csv') except AlreadyExistsException: ds = Repository.get_instance().get_datasource_by_name( 'Pima Indians Diabetes') training_pipeline.add_datasource(ds) # Add a split training_pipeline.add_split(RandomSplit(split_map={'eval': 0.3, 'train': 0.7})) # Add a preprocessing unit training_pipeline.add_preprocesser( StandardPreprocesser(features=[ 'times_pregnant', 'pgc', 'dbp', 'tst', 'insulin', 'bmi', 'pedigree', 'age' ], labels=['has_diabetes'], overwrite={ 'has_diabetes': { 'transform': [{
from zenml.core.steps.preprocesser.standard_preprocesser \ .standard_preprocesser import \ StandardPreprocesser from zenml.core.steps.split.categorical_domain_split_step import \ CategoricalDomainSplit from zenml.core.steps.trainer.tensorflow_trainers.tf_ff_trainer import \ FeedForwardTrainer from zenml.utils import path_utils from zenml.utils.logger import get_logger logger = get_logger(__name__) # reset pipeline root to redirect to tests so that it writes the yamls there ZENML_ROOT = str(Path(zenml.__path__[0]).parent) TEST_ROOT = os.path.join(ZENML_ROOT, "tests") Repository.init_repo(TEST_ROOT, analytics_opt_in=False) pipeline_root = os.path.join(TEST_ROOT, "pipelines") csv_root = os.path.join(TEST_ROOT, "test_data") image_root = os.path.join(csv_root, "images") repo: Repository = Repository.get_instance() if path_utils.is_dir(pipeline_root): path_utils.rm_dir(pipeline_root) repo.zenml_config.set_pipelines_dir(pipeline_root) try: for i in range(1, 6): training_pipeline = TrainingPipeline(name='csvtest{0}'.format(i)) try:
def wrapper(filename):
    """
    Point the repository at `pipeline_root` and remove the named config
    file from that directory.

    Args:
        filename: name of the config file, relative to `pipeline_root`.
    """
    zenml_config = Repository.get_instance().zenml_config
    zenml_config.set_pipelines_dir(pipeline_root)

    config_path = os.path.join(pipeline_root, filename)
    path_utils.rm_file(config_path)
def set_metadata_store():
    """Compares pipelines in repo"""
    # NOTE(review): the function name mentions the metadata store, but the
    # body only launches the pipeline comparison - confirm the name.
    start_message = 'Comparing pipelines in repo: Starting app..'
    click.echo(start_message)
    Repository.get_instance().compare_pipelines()
def compare_pipelines(repo: Repository):
    """Compares pipelines in repo"""
    # Announce first, then hand over to the repository's comparison.
    start_message = 'Comparing pipelines in repo: Starting app..'
    click.echo(start_message)
    repo.compare_pipelines()
def list_datasources(repo: Repository):
    """Print a table of the configs of all datasources in the repository."""
    # Each datasource's config becomes one row; column titles are taken
    # from the config keys.
    config_rows = [
        datasource.to_config() for datasource in repo.get_datasources()
    ]
    table = tabulate(config_rows, headers="keys")
    click.echo(table)