Python Dataframe Examples, rubicon.domain.Dataframe Python Examples

Example #1

0

Show file

    async def log_dataframe(self, df, description=None, tags=[]):
        """Overrides `rubicon.client.DataframeMixin.log_dataframe` to
        asynchronously log a dataframe to this client object.

        Parameters
        ----------
        df : pandas.DataFrame or dask.dataframe.DataFrame
            The `dask` or `pandas` dataframe to log.
        description : str, optional
            The dataframe's description. Use to provide
            additional context.
        tags : list of str
            The values to tag the dataframe with.

        Returns
        -------
        rubicon.client.Dataframe
            The new dataframe.
        """
        dataframe = domain.Dataframe(parent_id=self._domain.id, description=description, tags=tags)

        project_name, experiment_id = self._get_parent_identifiers()
        await self.repository.create_dataframe(
            dataframe, df, project_name, experiment_id=experiment_id
        )

        return client.Dataframe(dataframe, self)

Example #2

0

Show file

    def get_dataframes_metadata(self, project_name, experiment_id=None):
        """Retrieve all dataframes' metadata from the configured
        filesystem that belong to the specified object.

        Parameters
        ----------
        project_name : str
            The name of the project to retrieve all dataframes
            from.
        experiment_id : str, optional
            The ID of the experiment to retrieve all dataframes
            from. Dataframes do not need to belong to an
            experiment.

        Returns
        -------
        list of rubicon.domain.Dataframe
            The dataframes logged to the specified object.
        """
        dataframe_metadata_root = self._get_dataframe_metadata_root(
            project_name, experiment_id)

        try:
            dataframe_metadata_paths = self._ls_directories_only(
                dataframe_metadata_root)
            dataframes = [
                domain.Dataframe(**json.loads(data)) for data in
                self.filesystem.cat(dataframe_metadata_paths).values()
            ]
        except FileNotFoundError:
            return []

        return dataframes

Example #3

0

Show file

    def get_dataframe_metadata(self, project_name, dataframe_id, experiment_id=None):
        """Retrieve a dataframes's metadata from the configured filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the dataframe with ID
            `dataframe_id` is logged to.
        dataframe_id : str
            The ID of the dataframe to retrieve.
        experiment_id : str, optional
            The ID of the experiment the dataframe with ID
            `dataframe_id` is logged to. Dataframes do not
            need to belong to an experiment.

        Returns
        -------
        rubicon.domain.Dataframe
            The dataframe with ID `dataframe_id`.
        """
        dataframe_metadata_path = self._get_dataframe_metadata_path(
            project_name, experiment_id, dataframe_id
        )

        try:
            open_file = self.filesystem.open(dataframe_metadata_path)
        except FileNotFoundError:
            raise RubiconException(f"No dataframe with id `{dataframe_id}` found.")

        with open_file as f:
            dataframe = json.load(f)

        return domain.Dataframe(**dataframe)

Example #4

0

Show file

    async def get_dataframes_metadata(self, project_name, experiment_id=None):
        """Overrides `rubicon.repository.BaseRepository.get_dataframes_metadata`
        to asynchronously retrieve all dataframes' metadata from the configured
        filesystem that belong to the specified object.

        Parameters
        ----------
        project_name : str
            The name of the project to retrieve all dataframes
            from.
        experiment_id : str, optional
            The ID of the experiment to retrieve all dataframes
            from. Dataframes do not need to belong to an
            experiment.

        Returns
        -------
        list of rubicon.domain.Dataframe
            The dataframes logged to the specified object.
        """
        dataframe_metadata_root = self._get_dataframe_metadata_root(project_name, experiment_id)

        try:
            dataframe_metadata_paths = await self._ls_directories_only(dataframe_metadata_root)
            dataframes = [
                domain.Dataframe(**json.loads(data))
                for data in await asyncio.gather(
                    *[self.filesystem._cat_file(path) for path in dataframe_metadata_paths]
                )
            ]
        except FileNotFoundError:
            return []

        return dataframes

Example #5

0

Show file

    async def get_dataframe_metadata(self, project_name, dataframe_id, experiment_id=None):
        """Overrides `rubicon.repository.BaseRepository.get_dataframe_metadata`
        to asynchronously retrieve a dataframes's metadata from the configured
        filesystem.

        Parameters
        ----------
        project_name : str
            The name of the project the dataframe with ID
            `dataframe_id` is logged to.
        dataframe_id : str
            The ID of the dataframe to retrieve.
        experiment_id : str, optional
            The ID of the experiment the dataframe with ID
            `dataframe_id` is logged to. Dataframes do not
            need to belong to an experiment.

        Returns
        -------
        rubicon.domain.Dataframe
            The dataframe with ID `dataframe_id`.
        """
        dataframe_metadata_path = self._get_dataframe_metadata_path(
            project_name, experiment_id, dataframe_id
        )

        try:
            dataframe = json.loads(await self.filesystem._cat_file(dataframe_metadata_path))
        except FileNotFoundError:
            raise RubiconException(f"No dataframe with id `{dataframe_id}` found.")

        return domain.Dataframe(**dataframe)

Example #6

0

Show file

def _create_dataframe(repository, project=None, dataframe_data=None):
    if project is None:
        project = _create_project(repository)

    if dataframe_data is None:
        dataframe_data = pd.DataFrame([[0, 1], [1, 0]], columns=["a", "b"])

    dataframe = domain.Dataframe(parent_id=project.id)
    repository.create_dataframe(dataframe, dataframe_data, project.name)

    return dataframe

Example #7

0

Show file

def test_properties(project_client):
    parent = project_client

    domain_dataframe = domain.Dataframe(description="some description",
                                        tags=["x"])
    dataframe = Dataframe(domain_dataframe, parent)

    assert dataframe.id == domain_dataframe.id
    assert dataframe.description == "some description"
    assert dataframe.tags == domain_dataframe.tags
    assert dataframe.created_at == domain_dataframe.created_at
    assert dataframe.parent == parent

Example #8

0

Show file

File: test_base_repo.py Project: mbseid/rubicon

def _create_dask_dataframe(repository, project=None):
    if project is None:
        project = _create_project(repository)

    df = pd.DataFrame([[0, 1, "a"], [1, 1, "b"], [2, 2, "c"], [3, 2, "d"]],
                      columns=["a", "b", "c"])
    ddf = dd.from_pandas(df, npartitions=1)

    dataframe = domain.Dataframe(parent_id=project.id)
    repository.create_dataframe(dataframe, ddf, project.name)

    return dataframe

Example #9

0

Show file

File: test_asyn_mixin_client.py Project: mbseid/rubicon

def test_delete_dataframes(asyn_client_w_mock_repo):
    rubicon = asyn_client_w_mock_repo

    project_name = f"Test Project {uuid.uuid4()}"
    project = asyncio.run(rubicon.create_project(project_name))
    dataframe_domains = [domain.Dataframe(parent_id=project.id) for _ in range(0, 3)]

    dataframe_ids = [d.id for d in dataframe_domains]
    asyncio.run(DataframeMixin.delete_dataframes(project, dataframe_ids))

    expected = [
        call.delete_dataframe(project.name, dataframe_id, experiment_id=None)
        for dataframe_id in dataframe_ids
    ]

    assert rubicon.repository.mock_calls[1:] == expected

Example #10

0

Show file

def test_get_dataframe_tags_with_experiment_parent_root(memory_repository):
    repository = memory_repository
    experiment = _create_experiment(repository)
    dataframe = domain.Dataframe(parent_id=experiment.id)
    dataframe_data = pd.DataFrame([[0, 1], [1, 0]], columns=["a", "b"])

    repository.create_dataframe(dataframe, dataframe_data,
                                experiment.project_name, experiment.id)

    dataframe_tags_root = repository._get_tag_metadata_root(
        experiment.project_name,
        experiment_id=experiment.id,
        dataframe_id=dataframe.id)

    assert (dataframe_tags_root ==
            f"{repository.root_dir}/{slugify(experiment.project_name)}/" +
            f"experiments/{experiment.id}/dataframes/{dataframe.id}")

Example #11

0

Show file

File: test_base_repo.py Project: mbseid/rubicon

def _create_pandas_dataframe(repository,
                             project=None,
                             dataframe_data=None,
                             multi_index=False):
    if project is None:
        project = _create_project(repository)

    if dataframe_data is None:
        dataframe_data = pd.DataFrame(
            [[0, 1, "a"], [1, 1, "b"], [2, 2, "c"], [3, 2, "d"]],
            columns=["a", "b", "c"])
        if multi_index:
            dataframe_data = dataframe_data.set_index(["b",
                                                       "a"])  # Set multiindex

    dataframe = domain.Dataframe(parent_id=project.id)
    repository.create_dataframe(dataframe, dataframe_data, project.name)

    return dataframe

Example #12

0

Show file

File: test_asyn_mixin_client.py Project: mbseid/rubicon

def test_get_dataframes(asyn_client_w_mock_repo):
    rubicon = asyn_client_w_mock_repo

    project_name = f"Test Project {uuid.uuid4()}"
    project = asyncio.run(rubicon.create_project(project_name))
    dataframe_domains = [domain.Dataframe(parent_id=project.id) for _ in range(0, 3)]

    rubicon.repository.get_dataframes_metadata.return_value = dataframe_domains

    dataframes = asyncio.run(DataframeMixin.dataframes(project))

    expected = [call.get_dataframes_metadata(project.name, experiment_id=None)]

    dataframe_ids = [d.id for d in dataframes]
    for dataframe_id in [d.id for d in dataframe_domains]:
        assert dataframe_id in dataframe_ids
        dataframe_ids.remove(dataframe_id)

    assert len(dataframe_ids) == 0
    assert rubicon.repository.mock_calls[1:] == expected

Example #13

0

Show file

def _create_dataframe_domain(project=None, tags=[]):
    if project is None:
        project = domain.Project(f"Test Project {uuid.uuid4()}")

    return project, domain.Dataframe(parent_id=project.id)