Exemple #1
0
    def sync(self, project_name, s3_root_dir):
        """Sync a local project to S3.

        Parameters
        ----------
        project_name : str
            The name of the project to sync.
        s3_root_dir : str
            The S3 path where the project's data
            will be synced.

        Raises
        ------
        RubiconException
            If the configured persistence is not `filesystem`, if the
            AWS CLI could not be started, or if `aws s3 sync` exits
            with a non-zero status (its captured stderr is the message).

        Notes
        -----
        Use to backup your local project data to S3, as an alternative to direct S3 logging.
        Relies on AWS CLI's sync. Ensure that your credentials are set and that your Proxy
        is on.
        """
        if self.config.persistence != "filesystem":
            raise RubiconException(
                "You can't sync projects written to memory. Sync from either local filesystem or S3."
            )

        project = self.get_project(project_name)
        project_slug = slugify(project.name)
        local_path = f"{self.config.root_dir}/{project_slug}"
        s3_path = f"{s3_root_dir}/{project_slug}"

        # Pass an argument list and skip the shell entirely: whitespace or
        # shell metacharacters in either path are then plain data and can
        # neither split the command nor inject another one.
        cmd = ["aws", "s3", "sync", local_path, s3_path]

        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            raise RubiconException(e.stderr) from e
        except OSError as e:
            # Without a shell, a missing or non-executable `aws` binary raises
            # here instead of producing a non-zero exit status; keep surfacing
            # it to callers as a `RubiconException`.
            raise RubiconException(f"Failed to run the AWS CLI: {e}") from e
Exemple #2
0
    def _validate_data(self, data_bytes, data_file, data_path, name):
        """Validate the inputs for an artifact and resolve them to
        raw bytes and a name.

        Parameters
        ----------
        data_bytes : bytes or None
            The raw data, if already in memory. An empty payload
            (`b""`) counts as provided.
        data_file : file-like or None
            An object usable as a context manager whose result
            has a `read` method. Only read when `data_bytes` is None.
        data_path : str or None
            A path opened in binary mode with `fsspec.open` when
            neither `data_bytes` nor `data_file` is given. Also
            supplies the default `name` (its basename).
        name : str or None
            The artifact's name. Required unless `data_path` is given.

        Returns
        -------
        tuple
            `(data_bytes, name)`.

        Raises
        ------
        RubiconException
            If no data source is provided, or if `name` is None and
            `data_path` is None.
        """
        # `data_bytes` is checked against None (not truthiness) to match the
        # `is None` test below; otherwise an explicit empty payload would be
        # rejected as if nothing had been provided.
        if data_bytes is None and not data_file and not data_path:
            raise RubiconException(
                "One of `data_bytes`, `data_file` or `data_path` must be provided."
            )

        if name is None:
            if data_path is None:
                raise RubiconException(
                    "`name` must be provided if not using `data_path`.")
            name = os.path.basename(data_path)

        if data_bytes is None:
            # Prefer the already-open file; fall back to opening the path.
            f = data_file if data_file is not None else fsspec.open(data_path, "rb")

            with f as open_file:
                data_bytes = open_file.read()

        return data_bytes, name
Exemple #3
0
    def plot(self, df_type="pandas", **kwargs):
        """Plot this dataframe's data through `hvplot`.

        Parameters
        ----------
        df_type : str, optional
            The type of dataframe, either `pandas` or `dask`. Any
            value other than `pandas` selects the `dask` backend import.
        kwargs : dict
            Keyword arguments forwarded unchanged to the `hvplot`
            call on the retrieved data.

        Returns
        -------
        The result of calling `.hvplot(**kwargs)` on the data returned
        by `self.get_data(df_type=df_type)`.

        Raises
        ------
        RubiconException
            If the `hvplot` backend module cannot be imported.

        Notes
        -----
        For usage, visit: https://hvplot.holoviz.org/user_guide/Plotting.html
        For customizations, visit:
        https://hvplot.holoviz.org/user_guide/Customization.html

        Examples
        --------
        >>> # Log a line plot
        >>> dataframe.plot(kind='line', x='Year', y='Number of Subscriptions')
        """
        wants_pandas = df_type == "pandas"

        try:
            # Imported only for its side effect (presumably what makes
            # `.hvplot` available on the dataframe).
            if not wants_pandas:
                import hvplot.dask  # noqa F401
            else:
                import hvplot.pandas  # noqa F401
        except ImportError:
            raise RubiconException(
                "`hvplot` is required for plotting. Install with `pip install hvplot`."
            )

        data = self.get_data(df_type=df_type)
        return data.hvplot(**kwargs)
Exemple #4
0
    async def get_artifact_metadata(self, project_name, artifact_id, experiment_id=None):
        """Asynchronously load an artifact's metadata from the configured
        filesystem. Overrides
        `rubicon.repository.BaseRepository.get_artifact_metadata`.

        Parameters
        ----------
        project_name : str
            The name of the project the artifact is logged to.
        artifact_id : str
            The ID of the artifact to retrieve.
        experiment_id : str, optional
            The ID of the experiment the artifact is logged to.
            Artifacts do not need to belong to an experiment.

        Returns
        -------
        rubicon.domain.Artifact
            The artifact with ID `artifact_id`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If the metadata file is not found.
        """
        metadata_location = self._get_artifact_metadata_path(
            project_name, experiment_id, artifact_id
        )

        try:
            raw_metadata = await self.filesystem._cat_file(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No artifact with id `{artifact_id}` found.")

        return domain.Artifact(**json.loads(raw_metadata))
Exemple #5
0
    async def get_dataframe_metadata(self, project_name, dataframe_id, experiment_id=None):
        """Asynchronously load a dataframe's metadata from the configured
        filesystem. Overrides
        `rubicon.repository.BaseRepository.get_dataframe_metadata`.

        Parameters
        ----------
        project_name : str
            The name of the project the dataframe is logged to.
        dataframe_id : str
            The ID of the dataframe to retrieve.
        experiment_id : str, optional
            The ID of the experiment the dataframe is logged to.
            Dataframes do not need to belong to an experiment.

        Returns
        -------
        rubicon.domain.Dataframe
            The dataframe with ID `dataframe_id`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If the metadata file is not found.
        """
        metadata_location = self._get_dataframe_metadata_path(
            project_name, experiment_id, dataframe_id
        )

        try:
            raw_metadata = await self.filesystem._cat_file(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No dataframe with id `{dataframe_id}` found.")

        return domain.Dataframe(**json.loads(raw_metadata))
Exemple #6
0
    def get_artifact_metadata(self, project_name, artifact_id, experiment_id=None):
        """Load an artifact's metadata from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the artifact is logged to.
        artifact_id : str
            The ID of the artifact to retrieve.
        experiment_id : str, optional
            The ID of the experiment the artifact is logged to.
            Artifacts do not need to belong to an experiment.

        Returns
        -------
        rubicon.domain.Artifact
            The artifact with ID `artifact_id`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If opening the metadata file raises `FileNotFoundError`.
        """
        metadata_location = self._get_artifact_metadata_path(
            project_name, experiment_id, artifact_id
        )

        try:
            handle = self.filesystem.open(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No artifact with id `{artifact_id}` found.")

        with handle as stream:
            contents = json.load(stream)

        return domain.Artifact(**contents)
Exemple #7
0
    def get_parameter(self, project_name, experiment_id, parameter_name):
        """Load a parameter from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project this parameter belongs to.
        experiment_id : str
            The ID of the experiment the parameter is logged to.
        parameter_name : str
            The name of the parameter to retrieve.

        Returns
        -------
        rubicon.domain.Parameter
            The parameter with name `parameter_name`, built from
            the stored JSON metadata.

        Raises
        ------
        RubiconException
            If opening the metadata file raises `FileNotFoundError`.
        """
        metadata_location = self._get_parameter_metadata_path(
            project_name, experiment_id, parameter_name
        )

        try:
            handle = self.filesystem.open(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No parameter with name '{parameter_name}' found.")

        with handle as stream:
            contents = json.load(stream)

        return domain.Parameter(**contents)
Exemple #8
0
    async def get_artifact_data(self, project_name, artifact_id, experiment_id=None):
        """Asynchronously fetch an artifact's raw data. Overrides
        `rubicon.repository.BaseRepository.get_artifact_data`.

        Parameters
        ----------
        project_name : str
            The name of the project the artifact is logged to.
        artifact_id : str
            The ID of the artifact to retrieve data from.
        experiment_id : str, optional
            The ID of the experiment the artifact is logged to.
            Artifacts do not need to belong to an experiment.

        Returns
        -------
        bytes
            The raw data of the artifact with ID `artifact_id`, exactly
            as returned by the filesystem's `_cat_file`.

        Raises
        ------
        RubiconException
            If the data file is not found.
        """
        data_location = self._get_artifact_data_path(project_name, experiment_id, artifact_id)

        try:
            return await self.filesystem._cat_file(data_location)
        except FileNotFoundError:
            raise RubiconException(f"No data for artifact with id `{artifact_id}` found.")
Exemple #9
0
    async def delete_artifact(self,
                              project_name,
                              artifact_id,
                              experiment_id=None):
        """Overrides `rubicon.repository.BaseRepository.delete_artifact` to
        asynchronously delete an artifact from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the artifact with ID
            `artifact_id` is logged to.
        artifact_id : str
            The ID of the artifact to delete.
        experiment_id : str, optional
            The ID of the experiment the artifact with ID
            `artifact_id` is logged to. Artifacts do not
            need to belong to an experiment.

        Raises
        ------
        RubiconException
            If the artifact's directory is not found.
        """
        artifact_metadata_root = self._get_artifact_metadata_root(
            project_name, experiment_id)

        try:
            # Await the filesystem's coroutine API, as the other async
            # methods here do with `_cat_file` / `_exists`. The blocking
            # `rm` must not be called from inside a coroutine.
            await self.filesystem._rm(f"{artifact_metadata_root}/{artifact_id}",
                                      recursive=True)
        except FileNotFoundError:
            raise RubiconException(
                f"No artifact with id `{artifact_id}` found.")
Exemple #10
0
    def get_dataframe_metadata(self, project_name, dataframe_id, experiment_id=None):
        """Load a dataframe's metadata from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the dataframe is logged to.
        dataframe_id : str
            The ID of the dataframe to retrieve.
        experiment_id : str, optional
            The ID of the experiment the dataframe is logged to.
            Dataframes do not need to belong to an experiment.

        Returns
        -------
        rubicon.domain.Dataframe
            The dataframe with ID `dataframe_id`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If opening the metadata file raises `FileNotFoundError`.
        """
        metadata_location = self._get_dataframe_metadata_path(
            project_name, experiment_id, dataframe_id
        )

        try:
            handle = self.filesystem.open(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No dataframe with id `{dataframe_id}` found.")

        with handle as stream:
            contents = json.load(stream)

        return domain.Dataframe(**contents)
Exemple #11
0
    async def get_metric(self, project_name, experiment_id, metric_name):
        """Asynchronously load a metric from the configured filesystem.
        Overrides `rubicon.repository.BaseRepository.get_metric`.

        Parameters
        ----------
        project_name : str
            The name of the project this metric belongs to.
        experiment_id : str
            The ID of the experiment the metric is logged to.
        metric_name : str
            The name of the metric to retrieve.

        Returns
        -------
        rubicon.domain.Metric
            The metric with name `metric_name`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If the metadata file is not found.
        """
        metadata_location = self._get_metric_metadata_path(
            project_name, experiment_id, metric_name
        )

        try:
            raw_metadata = await self.filesystem._cat_file(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No metric with name '{metric_name}' found.")

        return domain.Metric(**json.loads(raw_metadata))
Exemple #12
0
    async def get_experiment(self, project_name, experiment_id):
        """Asynchronously load an experiment from the configured filesystem.
        Overrides `rubicon.repository.BaseRepository.get_experiment`.

        Parameters
        ----------
        project_name : str
            The name of the project the experiment is logged to.
        experiment_id : str
            The ID of the experiment to retrieve.

        Returns
        -------
        rubicon.domain.Experiment
            The experiment with ID `experiment_id`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If the metadata file is not found.
        """
        metadata_location = self._get_experiment_metadata_path(project_name, experiment_id)

        try:
            raw_metadata = await self.filesystem._cat_file(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No experiment with id `{experiment_id}` found.")

        return domain.Experiment(**json.loads(raw_metadata))
Exemple #13
0
    async def get_feature(self, project_name, experiment_id, feature_name):
        """Asynchronously load a feature from the configured filesystem.
        Overrides `rubicon.repository.BaseRepository.get_feature`.

        Parameters
        ----------
        project_name : str
            The name of the project the experiment with ID
            `experiment_id` is logged to.
        experiment_id : str
            The ID of the experiment the feature is logged to.
        feature_name : str
            The name of the feature to retrieve.

        Returns
        -------
        rubicon.domain.Feature
            The feature with name `feature_name`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If the metadata file is not found.
        """
        metadata_location = self._get_feature_metadata_path(
            project_name, experiment_id, feature_name
        )

        try:
            raw_metadata = await self.filesystem._cat_file(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No feature with name '{feature_name}' found.")

        return domain.Feature(**json.loads(raw_metadata))
Exemple #14
0
    def get_artifact_data(self, project_name, artifact_id, experiment_id=None):
        """Retrieve an artifact's raw data.

        Parameters
        ----------
        project_name : str
            The name of the project the artifact with ID
            `artifact_id` is logged to.
        artifact_id : str
            The ID of the artifact to retrieve data from.
        experiment_id : str, optional
            The ID of the experiment the artifact with ID
            `artifact_id` is logged to. Artifacts do not
            need to belong to an experiment.

        Returns
        -------
        bytes
            The artifact with ID `artifact_id`'s raw data.

        Raises
        ------
        RubiconException
            If opening the data file raises `FileNotFoundError`.
        """
        artifact_data_path = self._get_artifact_data_path(
            project_name, experiment_id, artifact_id)

        try:
            open_file = self.filesystem.open(artifact_data_path, "rb")
        except FileNotFoundError:
            raise RubiconException(
                f"No data for artifact with id `{artifact_id}` found.")

        # Read inside a context manager so the handle is closed once the
        # bytes are in memory, matching the metadata getters.
        with open_file as f:
            return f.read()
Exemple #15
0
    def get_experiment(self, project_name, experiment_id):
        """Load an experiment from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the experiment is logged to.
        experiment_id : str
            The ID of the experiment to retrieve.

        Returns
        -------
        rubicon.domain.Experiment
            The experiment with ID `experiment_id`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If opening the metadata file raises `FileNotFoundError`.
        """
        metadata_location = self._get_experiment_metadata_path(project_name, experiment_id)

        try:
            handle = self.filesystem.open(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No experiment with id `{experiment_id}` found.")

        with handle as stream:
            contents = json.load(stream)

        return domain.Experiment(**contents)
Exemple #16
0
 def _check_is_in_git_repo(self):
     """Raise a `RubiconException` if not called from within a `git` repository.

     Runs `git rev-parse --git-dir` with its output captured and treats
     any non-zero exit status as "not inside a repository".
     """
     if subprocess.run(["git", "rev-parse", "--git-dir"],
                       capture_output=True).returncode != 0:
         raise RubiconException(
             "Not a `git` repo: Failed to locate the '.git' directory in this or any parent directories."
         )
Exemple #17
0
    def get_feature(self, project_name, experiment_id, feature_name):
        """Load a feature from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the experiment with ID
            `experiment_id` is logged to.
        experiment_id : str
            The ID of the experiment the feature is logged to.
        feature_name : str
            The name of the feature to retrieve.

        Returns
        -------
        rubicon.domain.Feature
            The feature with name `feature_name`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If opening the metadata file raises `FileNotFoundError`.
        """
        metadata_location = self._get_feature_metadata_path(
            project_name, experiment_id, feature_name
        )

        try:
            handle = self.filesystem.open(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No feature with name '{feature_name}' found.")

        with handle as stream:
            contents = json.load(stream)

        return domain.Feature(**contents)
Exemple #18
0
    def plot(self, **kwargs):
        """Plot this dataframe's data through `hvplot`.

        Parameters
        ----------
        kwargs : dict
            Keyword arguments forwarded unchanged to the `hvplot`
            call on `self.data`.

        Returns
        -------
        The result of `self.data.hvplot(**kwargs)`.

        Raises
        ------
        RubiconException
            If `hvplot.dask` cannot be imported.

        Notes
        -----
        For usage, visit: https://hvplot.holoviz.org/user_guide/Plotting.html
        For customizations, visit:
        https://hvplot.holoviz.org/user_guide/Customization.html

        Examples
        --------
        >>> # Log a line plot
        >>> dataframe.plot(kind='line', x='Year', y='Number of Subscriptions')
        """
        try:
            # The stored data is a dask dataframe, so the dask backend of
            # `hvplot` is the one imported (for its side effect only).
            import hvplot.dask  # noqa F401
        except ImportError:
            raise RubiconException(
                "`hvplot` is required for plotting. Install with `pip install hvplot`."
            )

        dask_data = self.data
        return dask_data.hvplot(**kwargs)
Exemple #19
0
    def get_metric(self, project_name, experiment_id, metric_name):
        """Load a metric from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the experiment with ID
            `experiment_id` is logged to.
        experiment_id : str
            The ID of the experiment the metric is logged to.
        metric_name : str
            The name of the metric to retrieve.

        Returns
        -------
        rubicon.domain.Metric
            The metric with name `metric_name`, built from the
            stored JSON metadata.

        Raises
        ------
        RubiconException
            If opening the metadata file raises `FileNotFoundError`.
        """
        metadata_location = self._get_metric_metadata_path(
            project_name, experiment_id, metric_name
        )

        try:
            handle = self.filesystem.open(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No metric with name '{metric_name}' found.")

        with handle as stream:
            contents = json.load(stream)

        return domain.Metric(**contents)
Exemple #20
0
    def get_dataframe_data(self, project_name, dataframe_id, experiment_id=None, df_type="pandas"):
        """Read a dataframe's raw data from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the dataframe is logged to.
        dataframe_id : str
            The ID of the dataframe to retrieve data from.
        experiment_id : str, optional
            The ID of the experiment the dataframe with ID
            `dataframe_id` is logged to. Dataframes do not
            need to belong to an experiment.
        df_type : str, optional
            The type of dataframe. Can be either `pandas` or `dask`.
            Passed through to `self._read_dataframe`.

        Returns
        -------
        The dataframe with ID `dataframe_id`'s raw data, as returned by
        `self._read_dataframe` for the requested `df_type`.

        Raises
        ------
        RubiconException
            If reading the data raises `FileNotFoundError`.
        """
        data_location = self._get_dataframe_data_path(project_name, experiment_id, dataframe_id)

        try:
            return self._read_dataframe(data_location, df_type)
        except FileNotFoundError:
            raise RubiconException(
                f"No data for dataframe with id `{dataframe_id}` found. This might have "
                "happened if you forgot to set `df_type='dask'` when trying to read a `dask` dataframe."
            )
Exemple #21
0
    def get_dataframe_data(self, project_name, dataframe_id, experiment_id=None):
        """Read a dataframe's raw data from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the dataframe is logged to.
        dataframe_id : str
            The ID of the dataframe to retrieve data from.
        experiment_id : str, optional
            The ID of the experiment the dataframe with ID
            `dataframe_id` is logged to. Dataframes do not
            need to belong to an experiment.

        Returns
        -------
        dask.dataframe.DataFrame
            The dataframe with ID `dataframe_id`'s raw data, as
            returned by `self._read_dataframe`.

        Raises
        ------
        RubiconException
            If reading the data raises `FileNotFoundError`.
        """
        data_location = self._get_dataframe_data_path(project_name, experiment_id, dataframe_id)

        try:
            return self._read_dataframe(data_location)
        except FileNotFoundError:
            raise RubiconException(f"No data for dataframe with id `{dataframe_id}` found.")
Exemple #22
0
    async def get_parameter(self, project_name, experiment_id, parameter_name):
        """Asynchronously load a parameter from the configured filesystem.
        Overrides `rubicon.repository.BaseRepository.get_parameter`.

        Parameters
        ----------
        project_name : str
            The name of the project this parameter belongs to.
        experiment_id : str
            The ID of the experiment the parameter is logged to.
        parameter_name : str
            The name of the parameter to retrieve.

        Returns
        -------
        rubicon.domain.Parameter
            The parameter with name `parameter_name`, built from
            the stored JSON metadata.

        Raises
        ------
        RubiconException
            If the metadata file is not found.
        """
        metadata_location = self._get_parameter_metadata_path(
            project_name, experiment_id, parameter_name
        )

        try:
            raw_metadata = await self.filesystem._cat_file(metadata_location)
        except FileNotFoundError:
            raise RubiconException(f"No parameter with name '{parameter_name}' found.")

        return domain.Parameter(**json.loads(raw_metadata))
Exemple #23
0
    def __init__(self, training_metadata):
        """Store `training_metadata` as a list of tuples.

        Parameters
        ----------
        training_metadata : tuple or list of tuple
            A value that is not a list is wrapped in a
            one-element list before validation.

        Raises
        ------
        RubiconException
            If any element of the resulting list is not a tuple.
        """
        if isinstance(training_metadata, list):
            entries = training_metadata
        else:
            entries = [training_metadata]

        for entry in entries:
            if not isinstance(entry, tuple):
                raise RubiconException("`training_metadata` must be a list of tuples.")

        self.training_metadata = entries
Exemple #24
0
    def create_project(self, project):
        """Write a new project to the configured filesystem.

        Parameters
        ----------
        project : rubicon.domain.Project
            The project to persist.

        Raises
        ------
        RubiconException
            If the project's metadata path already exists.
        """
        metadata_location = self._get_project_metadata_path(project.name)

        already_exists = self.filesystem.exists(metadata_location)
        if already_exists:
            raise RubiconException(f"A project with name '{project.name}' already exists.")

        self._persist_domain(project, metadata_location)
Exemple #25
0
    def _read_dataframe(self, path, df_type="pandas"):
        """Read a parquet dataframe from `path` as `pandas` or `dask`.

        For `pandas`, the file `{path}/data.parquet` is read; for `dask`,
        `path` itself is passed to the reader. Both use the `pyarrow` engine.

        Raises
        ------
        RubiconException
            If `df_type` is neither `pandas` nor `dask`.
        """
        supported_types = ["pandas", "dask"]
        if df_type not in supported_types:
            raise RubiconException(f"`df_type` must be one of {supported_types}")

        if df_type != "pandas":
            return dd.read_parquet(path, engine="pyarrow")

        return pd.read_parquet(f"{path}/data.parquet", engine="pyarrow")
Exemple #26
0
    def _get_repository(self):
        """Instantiate the repository registered for this configuration.

        The lookup key in `self.REPOSITORIES` is
        `"{persistence}-{protocol}"`, where the protocol comes from
        `self._get_protocol()`. The matching entry is called with
        `self.root_dir` and `self.storage_options` as keyword arguments.

        Raises
        ------
        RubiconException
            If no repository is registered under that key.
        """
        protocol = self._get_protocol()
        repository_factory = self.REPOSITORIES.get(f"{self.persistence}-{protocol}")

        if repository_factory is not None:
            return repository_factory(self.root_dir, **self.storage_options)

        raise RubiconException(
            f"{self.__class__.__module__}.{self.__class__.__name__} has no persistence "
            f"layer for the provided configuration: `persistence`: {self.persistence}, "
            f"`protocol` (from `root_dir`): {protocol}"
        )
Exemple #27
0
    async def create_project(self, project):
        """Asynchronously write a new project to the configured filesystem.
        Overrides `rubicon.repository.BaseRepository.create_project`.

        Parameters
        ----------
        project : rubicon.domain.Project
            The project to persist.

        Raises
        ------
        RubiconException
            If the project's metadata path already exists.
        """
        metadata_location = self._get_project_metadata_path(project.name)

        already_exists = await self.filesystem._exists(metadata_location)
        if already_exists:
            raise RubiconException(f"A project with name '{project.name}' already exists.")

        await self._persist_domain(project, metadata_location)
Exemple #28
0
    def __init__(
        self,
        estimator=None,
        experiment=None,
        step_name=None,
        select=None,
        ignore=None,
        ignore_all=False,
    ):
        """Configure the `select` / `ignore` filters and initialize the
        parent class.

        Parameters
        ----------
        estimator, experiment, step_name : optional
            Forwarded unchanged to the parent class's `__init__`.
        select : list, optional
            Stored as `self.select`. Defaults to a new empty list.
        ignore : list, optional
            Stored as `self.ignore`. Defaults to a new empty list.
        ignore_all : bool, optional
            Stored as `self.ignore_all`.

        Raises
        ------
        RubiconException
            If both `select` and `ignore` are non-empty.
        """
        # Default to `None` and build a fresh list per call: a literal `[]`
        # default is created once and shared by every instance, so mutating
        # one instance's `select`/`ignore` would leak into all the others.
        select = [] if select is None else select
        ignore = [] if ignore is None else ignore

        if ignore and select:
            raise RubiconException("provide either `select` OR `ignore`, not both")

        self.ignore = ignore
        self.ignore_all = ignore_all
        self.select = select

        super().__init__(estimator=estimator, experiment=experiment, step_name=step_name)
Exemple #29
0
    def create_parameter(self, parameter, project_name, experiment_id):
        """Write a new parameter to the configured filesystem.

        Parameters
        ----------
        parameter : rubicon.domain.Parameter
            The parameter to persist.
        project_name : str
            The name of the project the experiment with ID
            `experiment_id` is logged to.
        experiment_id : str
            The ID of the experiment this parameter belongs to.

        Raises
        ------
        RubiconException
            If the parameter's metadata path already exists.
        """
        metadata_location = self._get_parameter_metadata_path(
            project_name, experiment_id, parameter.name
        )

        already_exists = self.filesystem.exists(metadata_location)
        if already_exists:
            raise RubiconException(f"A parameter with name '{parameter.name}' already exists.")

        self._persist_domain(parameter, metadata_location)
Exemple #30
0
    def create_metric(self, metric, project_name, experiment_id):
        """Write a new metric to the configured filesystem.

        Parameters
        ----------
        metric : rubicon.domain.Metric
            The metric to persist.
        project_name : str
            The name of the project the experiment with ID
            `experiment_id` is logged to.
        experiment_id : str
            The ID of the experiment this metric belongs to.

        Raises
        ------
        RubiconException
            If the metric's metadata path already exists.
        """
        metadata_location = self._get_metric_metadata_path(
            project_name, experiment_id, metric.name
        )

        already_exists = self.filesystem.exists(metadata_location)
        if already_exists:
            raise RubiconException(f"A metric with name '{metric.name}' already exists.")

        self._persist_domain(metric, metadata_location)