Example #1
def init(repo_path: Text, pipelines_dir: Text = None,
         analytics_opt_in: bool = None):
    """Initialize ZenML on given path."""
    if repo_path is None:
        repo_path = os.getcwd()

    if analytics_opt_in is None:
        analytics_opt_in = confirmation(
            "ZenML collects anonymized usage information. This data helps us "
            "create a better product and understand the needs of the "
            "community better. You can find more information about exactly "
            "why, what and how we collect usage analytics statistics at: "
            "https://docs.zenml.io/misc/usage-analytics.html. "
            "Would you like to opt-in to usage analytics?")

    try:
        Repository.init_repo(
            repo_path,
            None,
            None,
            pipelines_dir,
            analytics_opt_in,
        )
        click.echo(f'ZenML repo initialized at {repo_path}')
    except git.InvalidGitRepositoryError:
        click.echo(f'{repo_path} is not a valid git repository! Please '
                   f'initialize ZenML within a git repository.')
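The same initialization can be done programmatically, without the CLI prompt. A minimal sketch, assuming only the call shape that also appears in the test setup of Example #24 (Repository.init_repo with analytics_opt_in):

import os
from zenml.repo import Repository

# Sketch: initialize ZenML in the current working directory and opt out of
# usage analytics. The path must already be a git repository, otherwise
# git.InvalidGitRepositoryError is raised, as handled above.
Repository.init_repo(os.getcwd(), analytics_opt_in=False)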
Example #2
    def register_pipeline(self, config: Dict[Text, Any]):
        """
        Registers a pipeline in the artifact store as a YAML file.

        Args:
            config: dict representation of ZenML config.
        """
        self._check_registered()
        Repository.get_instance().register_pipeline(file_name=self.file_name,
                                                    config=config)
Example #3
    def run(self, config: Dict[Text, Any]):
        # Extract the paths to create the tar
        logger.info('Orchestrating pipeline on AWS..')

        repo: Repository = Repository.get_instance()
        repo_path = repo.path
        config_dir = repo.zenml_config.config_dir
        tar_file_name = \
            f'{EXTRACTED_TAR_DIR_NAME}_{str(int(time.time()))}.tar.gz'
        path_to_tar = os.path.join(config_dir, tar_file_name)

        # Create tarfile but exclude the .zenml folder if it exists
        path_utils.create_tarfile(repo_path, path_to_tar)
        logger.info(f'Created tar of current repository at: {path_to_tar}')

        # Upload tar to artifact store
        store_path = config[keys.GlobalKeys.ARTIFACT_STORE]
        store_staging_area = os.path.join(store_path, STAGING_AREA)
        store_path_to_tar = os.path.join(store_staging_area, tar_file_name)
        path_utils.copy(path_to_tar, store_path_to_tar)
        logger.info(f'Copied tar to artifact store at: {store_path_to_tar}')

        # Remove tar
        path_utils.rm_dir(path_to_tar)
        logger.info(f'Removed tar at: {path_to_tar}')

        # Append path of tar in config orchestrator utils
        config[keys.GlobalKeys.BACKEND][keys.BackendKeys.ARGS][
            TAR_PATH_ARG] = store_path_to_tar

        # Launch the instance
        self.launch_instance(config)
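For orientation, a hedged sketch of the config structure this run() method assumes; the key constants are the ones referenced above (the keys module is the same one used by the surrounding example and is not imported here), and the artifact store path is purely illustrative:

config = {
    keys.GlobalKeys.ARTIFACT_STORE: 'gs://my-bucket/artifact_store',  # hypothetical path
    keys.GlobalKeys.BACKEND: {
        keys.BackendKeys.ARGS: {},  # TAR_PATH_ARG is filled in by run()
    },
}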
Example #4
    def __init__(self, name: Text, _id: Text = None, *args, **kwargs):
        """
        Construct the datasource
        Args:
            name (str): name of datasource
            schema (dict): schema of datasource
            _id: unique ID (for internal use)
        """
        if _id:
            # It's loaded from config
            self._id = _id
            logger.debug(f'Datasource {name} loaded.')
        else:
            # If none, then this is assumed to be 'new'. Check dupes.
            all_names = Repository.get_instance().get_datasource_names()
            if any(d == name for d in all_names):
                raise AlreadyExistsException(name=name,
                                             resource_type='datasource')
            self._id = str(uuid4())
            track(event=CREATE_DATASOURCE)
            logger.info(f'Datasource {name} created.')

        self.name = name
        self._immutable = False
        self._source = source_utils.resolve_class(self.__class__)
Example #5
def get_datasource_by_name(repo: Repository, datasource_name: Text):
    """
    Gets pipeline from current repository by matching a name identifier
    against the data source name.

    """
    pretty_print(repo.get_datasource_by_name(datasource_name))
Example #6
    def _get_one_pipeline(self):
        """Gets representative pipeline from all pipelines associated."""
        pipelines = \
            Repository.get_instance().get_pipelines_by_datasource(self)

        if len(pipelines) == 0:
            raise EmptyDatasourceException
        return pipelines[0]
Example #7
    def wrapper():
        repo: Repository = Repository.get_instance()
        pipelines_dir = repo.zenml_config.get_pipelines_dir()
        for p_config in path_utils.list_dir(pipelines_dir):
            try:
                os.remove(p_config)
            except Exception as e:
                print(e)
Example #8
def list_steps(repo: Repository):
    step_versions = repo.get_step_versions()
    name_version_data = []
    headers = ["step_name", "step_version"]
    for name, version_set in step_versions.items():
        names = [name] * len(version_set)
        versions = list(version_set)
        name_version_data.extend(list(zip(names, versions)))

    click.echo(tabulate(name_version_data, headers=headers))
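A hedged sketch of the mapping list_steps expects: get_step_versions() returns step names mapped to a set of version pins (the names and shas below are hypothetical):

# Hypothetical return value of repo.get_step_versions():
step_versions = {
    'my_project.steps.trainer.MyTrainer': {'1a2b3c4', '5d6e7f8'},
    'my_project.steps.split.MySplit': {'9e8d7c6'},
}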
Example #9
def get_pipeline_by_name(repo: Repository, pipeline_name: Text):
    """
    Gets pipeline from current repository by matching a name against a
    pipeline name in the repository.
    """
    try:
        p = repo.get_pipeline_by_name(pipeline_name)
    except Exception as e:
        error(e)
        return

    pretty_print(p)
Example #10
def load_source_path_class(source_path: Text) -> Type:
    """
    Loads a Python class from the path provided.

    Args:
        source_path (str): relative module path e.g. this.module.Class[@sha]
    """
    source = source_path.split('@')[0]
    pin = source_path.split('@')[-1]
    is_standard = is_standard_pin(pin)

    if '@' in source_path and not is_standard:
        logger.debug('Pinned step found with git sha. '
                     'Loading class from git history.')
        wrapper: GitWrapper = Repository.get_instance().get_git_wrapper()

        module_path = get_module_path_from_source(source_path)
        relative_module_path = get_relative_path_from_module(module_path)

        logger.warning('Found source with a pinned sha. Will now checkout '
                       f'module: {module_path}')

        # critical step
        if not wrapper.check_module_clean(source_path):
            raise Exception(f'One of the files at {relative_module_path} '
                            f'is not committed and we '
                            f'are trying to load that directory from git '
                            f'history due to a pinned step in the pipeline. '
                            f'Please commit the file and then run the '
                            f'pipeline.')

        # Check out the directory at that sha
        wrapper.checkout(sha_or_branch=pin, directory=relative_module_path)

        # After this point, all exceptions will first undo the above
        try:
            class_ = import_class_by_path(source)
            wrapper.reset(relative_module_path)
            wrapper.checkout(directory=relative_module_path)
        except Exception:
            # Undo the checkout before propagating the original error
            wrapper.reset(relative_module_path)
            wrapper.checkout(directory=relative_module_path)
            raise
    elif '@' in source_path and is_standard:
        logger.debug(f'Default {APP_NAME} class used. Loading directly.')
        # TODO: [LOW] Check if ZenML version is installed before loading.
        class_ = import_class_by_path(source)
    else:
        logger.debug('Unpinned step found with no git sha. Attempting to '
                     'load class from current repository state.')
        class_ = import_class_by_path(source)

    return class_
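A short usage sketch of the source path format described in the docstring; the module path, class name and sha are hypothetical:

# Unpinned: loaded from the current repository state.
unpinned = 'my_project.steps.trainer.MyTrainer'
# Pinned to a git sha: the module directory is checked out at that sha first.
pinned = 'my_project.steps.trainer.MyTrainer@7f3a2c1'

trainer_class = load_source_path_class(pinned)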
Example #11
def resolve_source_path(source_path: Text) -> Text:
    """
    Resolves source path with an optional sha using Git.

    Args:
        source_path (str): relative module path e.g. this.module.Class
    """
    if is_standard_step(source_path):
        # that means use standard version
        return resolve_standard_source_path(source_path)

    # otherwise use Git resolution
    wrapper: GitWrapper = Repository.get_instance().get_git_wrapper()
    source_path = wrapper.resolve_source_path(source_path)
    return source_path
Example #12
def list_pipelines(repo: Repository):
    """Lists pipelines in the current repository."""
    try:
        pipelines = repo.get_pipelines()

        names = [p.name for p in pipelines]
        types = [p.PIPELINE_TYPE for p in pipelines]
        statuses = [p.get_status() for p in pipelines]
        cache_enabled = [p.enable_cache for p in pipelines]
        filenames = [p.file_name for p in pipelines]

        headers = ["name", "type", "cache enabled", "status", "file name"]

        click.echo(tabulate(zip(names, types, cache_enabled,
                                statuses, filenames),
                            headers=headers))
    except Exception as e:
        error(e)
Example #13
    def get_config(self):
        predictor_path = self.predictor.__module__ + '.' + \
                         self.predictor.__name__
        p_file_path = \
            get_path_from_source(get_class_path_from_source(predictor_path))
        repo: Repository = Repository.get_instance()

        return {
            "cortex_serving_args": {
                "env": self.env,
                "api_config": self.api_config,
                "predictor_path": os.path.join(repo.path, p_file_path),
                "requirements": self.requirements,
                "conda_packages": self.conda_packages,
                "force": self.force,
                "wait": self.wait,
            }
        }
Example #14
def set_metadata_store(store_type, args):
    """Set metadata store for local config."""

    try:
        parsed_args = parse_unknown_options(args)
    except AssertionError as e:
        click.echo(str(e))
        return

    # TODO: [LOW] Hard-coded
    config = {'type': store_type, 'args': parsed_args}
    from zenml.metadata.metadata_wrapper import ZenMLMetadataStore

    store = ZenMLMetadataStore.from_config(config)
    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_metadata_store(store)

    click.echo(f'Metadata store set to: {store.to_config()}')
Example #15
    def __init__(self, **params):
        super(Application, self).__init__(**params)

        # lists
        result_list = []
        hparam_list = []
        repo: Repository = Repository.get_instance()

        # get all pipelines in this workspace
        all_pipelines: List[TrainingPipeline] = repo.get_pipelines_by_type(
            [TrainingPipeline.PIPELINE_TYPE])

        # get a dataframe of all results + all hyperparameter combinations
        for p in all_pipelines:
            # This is slowing the comparison down but
            # necessary to update the status of each run
            if p.get_status() == PipelineStatusTypes.Succeeded.name:
                eval_path = p.get_artifacts_uri_by_component(
                    GDPComponent.Evaluator.name)[0]

                evaluation = tfma.load_eval_result(eval_path)
                for s, m in evaluation.slicing_metrics:
                    result_list.append(
                        dict([('pipeline_name', '{}'.format(p.name)),
                              ('slice_name', s[0][0] if s else ''),
                              ('slice_value', s[0][1] if s else '')]))
                    result_list[-1].update(
                        {f'metric_{k}': m[k]['']
                         for k, v in m.items()})

                h_dict = p.get_hyperparameters()
                h_dict['pipeline_name'] = p.name
                hparam_list.append(h_dict)

        self.results = pd.DataFrame([parse_metrics(r) for r in result_list])
        self.hparam_info = pd.DataFrame(hparam_list)

        # set params
        self.param.pipeline_run_selector.objects = self.results[
            'pipeline_name'].unique()
Example #16
    def run(self, config: Dict[Text, Any]):
        """
        This run function essentially calls an underlying TFX orchestrator run.
        However, it is meant as a higher-level abstraction with some
        opinionated decisions taken.

        Args:
            config: a ZenML config dict
        """
        # Extract the paths to create the tar
        logger.info('Orchestrating pipeline on GCP..')

        repo: Repository = Repository.get_instance()
        repo_path = repo.path
        config_dir = repo.zenml_config.config_dir
        tar_file_name = \
            f'{EXTRACTED_TAR_DIR_NAME}_{str(int(time.time()))}.tar.gz'
        path_to_tar = os.path.join(config_dir, tar_file_name)

        # Create tarfile but exclude the .zenml folder if it exists
        path_utils.create_tarfile(repo_path, path_to_tar)
        logger.info(f'Created tar of current repository at: {path_to_tar}')

        # Upload tar to artifact store
        store_path = config[keys.GlobalKeys.ARTIFACT_STORE]
        store_staging_area = os.path.join(store_path, STAGING_AREA)
        store_path_to_tar = os.path.join(store_staging_area, tar_file_name)
        path_utils.copy(path_to_tar, store_path_to_tar)
        logger.info(f'Copied tar to artifact store at: {store_path_to_tar}')

        # Remove tar
        path_utils.rm_dir(path_to_tar)
        logger.info(f'Removed tar at: {path_to_tar}')

        # Append path of tar in config orchestrator utils
        config[keys.GlobalKeys.BACKEND][
            keys.BackendKeys.ARGS][TAR_PATH_ARG] = store_path_to_tar

        # Launch the instance
        self.launch_instance(config)
Example #17
from zenml.datasources import CSVDatasource
from zenml.pipelines import TrainingPipeline
from zenml.repo import Repository
from zenml.steps.evaluator import TFMAEvaluator
from zenml.steps.preprocesser import StandardPreprocesser
from zenml.steps.split import RandomSplit
from zenml.steps.trainer import TFFeedForwardTrainer
from zenml.exceptions import AlreadyExistsException

# Define the training pipeline
training_pipeline = TrainingPipeline()

# Add a datasource. This will automatically track and version it.
try:
    ds = CSVDatasource(name='Pima Indians Diabetes',
                       path='gs://zenml_quickstart/diabetes.csv')
except AlreadyExistsException:
    ds = Repository.get_instance().get_datasource_by_name(
        'Pima Indians Diabetes')
training_pipeline.add_datasource(ds)

# Add a split
training_pipeline.add_split(RandomSplit(split_map={'train': 0.7, 'eval': 0.3}))

# Add a preprocessing unit
training_pipeline.add_preprocesser(
    StandardPreprocesser(features=[
        'times_pregnant', 'pgc', 'dbp', 'tst', 'insulin', 'bmi', 'pedigree',
        'age'
    ],
                         labels=['has_diabetes'],
                         overwrite={
                             'has_diabetes': {
                                 'transform': [{
Example #18
    def load_config(self) -> Dict[Text, Any]:
        """Loads a config dict from the pipeline's YAML file."""
        return Repository.get_instance().load_pipeline_config(
            file_name=self.file_name)
Example #19
    def __init__(self,
                 name: Text = None,
                 enable_cache: Optional[bool] = True,
                 steps_dict: Dict[Text, BaseStep] = None,
                 backend: OrchestratorBaseBackend = None,
                 metadata_store: Optional[ZenMLMetadataStore] = None,
                 artifact_store: Optional[ArtifactStore] = None,
                 datasource: Optional[BaseDatasource] = None,
                 pipeline_name: Optional[Text] = None,
                 *args,
                 **kwargs):
        """
        Construct a base pipeline. This is a base interface that is meant
        to be overridden in multiple other pipeline use cases.

        Args:
            name: Outward-facing name of the pipeline.
            pipeline_name: A unique name that identifies the pipeline after
             it is run.
            enable_cache: Boolean, indicates whether or not caching
             should be used.
            steps_dict: Optional dict of steps.
            backend: Orchestrator backend.
            metadata_store: Configured metadata store. If None,
             the default metadata store is used.
            artifact_store: Configured artifact store. If None,
             the default artifact store is used.
        """
        # Generate a name if not given
        if name is None:
            name = str(round(time.time() * 1000))
        self.name = name
        self._immutable = False

        # Metadata store
        if metadata_store:
            self.metadata_store: ZenMLMetadataStore = metadata_store
        else:
            # use default
            self.metadata_store: ZenMLMetadataStore = \
                Repository.get_instance().get_default_metadata_store()

        if pipeline_name:
            # This means it's been loaded in through YAML; try to get the context
            self.pipeline_name = pipeline_name
            self.file_name = self.pipeline_name + '.yaml'
        else:
            # if pipeline_name is None then it's a new pipeline
            self.pipeline_name = self.create_pipeline_name_from_name()
            self.file_name = self.pipeline_name + '.yaml'
            # check duplicates here as it's a 'new' pipeline
            self._check_registered()
            track(event=CREATE_PIPELINE)
            logger.info(f'Pipeline {name} created.')

        self.enable_cache = enable_cache

        if steps_dict is None:
            self.steps_dict: Dict[Text, BaseStep] = {}
        else:
            self.steps_dict = steps_dict

        # Default to local
        if backend is None:
            self.backend = OrchestratorBaseBackend()
        else:
            self.backend = backend

        # Artifact store
        if artifact_store:
            self.artifact_store = artifact_store
        else:
            # use default
            self.artifact_store = \
                Repository.get_instance().get_default_artifact_store()

        # Datasource
        if datasource:
            self.datasource = datasource
        else:
            self.datasource = None

        self._source = source_utils.resolve_class(self.__class__)
        self._kwargs = {
            keys.PipelineDetailKeys.NAME: self.pipeline_name,
            keys.PipelineDetailKeys.ENABLE_CACHE: self.enable_cache,
        }
        if kwargs:
            self._kwargs.update(kwargs)
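A minimal sketch of how these defaults play out, using the TrainingPipeline subclass seen in the other examples; only name and enable_cache are passed, so the metadata store, artifact store and backend fall back to the repository defaults:

from zenml.pipelines import TrainingPipeline

# Sketch: a new pipeline gets a generated pipeline_name, is checked for
# duplicates against the repository, and is tracked on creation.
pipeline = TrainingPipeline(name='my-experiment', enable_cache=True)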
Example #20
    def _check_registered(self):
        if Repository.get_instance().get_pipeline_by_name(
                self.name) is not None:
            raise AlreadyExistsException(name=self.name,
                                         resource_type='pipeline')
Example #21
def set_artifact_store(path: Text = None):
    """Change artifact store for local config."""
    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_artifact_store(path)
    click.echo(f'Default artifact store updated to {path}')
Example #22
def get_artifact_store():
    """Print artifact store from local config."""
    repo: Repository = Repository.get_instance()
    click.echo(f'Default artifact store points to: '
               f'{repo.get_default_artifact_store().path}')
Example #23
from examples.nlp.training.trainer import UrduTrainer
from zenml.datasources import CSVDatasource
from zenml.exceptions import AlreadyExistsException
from zenml.pipelines import NLPPipeline
from zenml.repo import Repository
from zenml.steps.split import RandomSplit
from zenml.steps.tokenizer import HuggingFaceTokenizerStep

nlp_pipeline = NLPPipeline()

try:
    ds = CSVDatasource(name="My Urdu Text",
                       path="gs://zenml_quickstart/urdu_fake_news.csv")
except AlreadyExistsException:
    ds = Repository.get_instance().get_datasource_by_name(name="My Urdu Text")

nlp_pipeline.add_datasource(ds)

tokenizer_step = HuggingFaceTokenizerStep(text_feature="news",
                                          tokenizer="bert-wordpiece",
                                          vocab_size=3000)

nlp_pipeline.add_tokenizer(tokenizer_step=tokenizer_step)

nlp_pipeline.add_split(RandomSplit(split_map={"train": 0.9, "eval": 0.1}))

nlp_pipeline.add_trainer(
    UrduTrainer(model_name="distilbert-base-uncased",
                epochs=3,
                batch_size=64,
Example #24
import os
from pathlib import Path

import zenml
from zenml.datasources import CSVDatasource
from zenml.pipelines import TrainingPipeline
from zenml.repo import Repository
from zenml.steps.preprocesser import StandardPreprocesser
from zenml.steps.split import CategoricalDomainSplit
from zenml.steps.trainer import TFFeedForwardTrainer
from zenml.utils import path_utils
from zenml.exceptions import AlreadyExistsException
from zenml.logger import get_logger

logger = get_logger(__name__)

# reset pipeline root to redirect to tests so that it writes the yamls there
ZENML_ROOT = str(Path(zenml.__path__[0]).parent)
TEST_ROOT = os.path.join(ZENML_ROOT, "tests")
Repository.init_repo(TEST_ROOT, analytics_opt_in=False)

pipeline_root = os.path.join(TEST_ROOT, "pipelines")
csv_root = os.path.join(TEST_ROOT, "test_data")
image_root = os.path.join(csv_root, "images")


repo: Repository = Repository.get_instance()
if path_utils.is_dir(pipeline_root):
    path_utils.rm_dir(pipeline_root)
repo.zenml_config.set_pipelines_dir(pipeline_root)

try:
    for i in range(1, 6):
        training_pipeline = TrainingPipeline(name='csvtest{0}'.format(i))
Example #25
def set_pipelines_dir(path: Text = None):
    """Change pipelines dir for local config."""
    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_pipelines_dir(path)
    click.echo(f'Default pipelines dir updated to {path}')
Example #26
def test_repo_double_init():
    # explicitly constructing another repository should fail
    with pytest.raises(Exception):
        _ = Repository()
Example #27
def get_pipelines_dir():
    """Print pipelines dir from local config."""
    repo: Repository = Repository.get_instance()
    click.echo(f'Default pipelines dir points to: '
               f'{repo.get_default_pipelines_dir()}')
Example #28
def get_metadata_store():
    """Print metadata store from local config."""
    repo: Repository = Repository.get_instance()
    click.echo(f'Metadata store: '
               f'{repo.get_default_metadata_store().to_config()}')
Example #29
def compare_pipelines(repo: Repository):
    """Compares pipelines in repo"""
    click.echo('Comparing training pipelines in repo: Starting app..')
    repo.compare_training_pipelines()
Example #30
def list_datasources(repo: Repository):
    datasources = repo.get_datasources()

    click.echo(tabulate([ds.to_config() for ds in datasources],
                        headers="keys"))