async def log_dataframe(self, df, description=None, tags=[]): """Overrides `rubicon.client.DataframeMixin.log_dataframe` to asynchronously log a dataframe to this client object. Parameters ---------- df : pandas.DataFrame or dask.dataframe.DataFrame The `dask` or `pandas` dataframe to log. description : str, optional The dataframe's description. Use to provide additional context. tags : list of str The values to tag the dataframe with. Returns ------- rubicon.client.Dataframe The new dataframe. """ dataframe = domain.Dataframe(parent_id=self._domain.id, description=description, tags=tags) project_name, experiment_id = self._get_parent_identifiers() await self.repository.create_dataframe( dataframe, df, project_name, experiment_id=experiment_id ) return client.Dataframe(dataframe, self)
def get_dataframes_metadata(self, project_name, experiment_id=None): """Retrieve all dataframes' metadata from the configured filesystem that belong to the specified object. Parameters ---------- project_name : str The name of the project to retrieve all dataframes from. experiment_id : str, optional The ID of the experiment to retrieve all dataframes from. Dataframes do not need to belong to an experiment. Returns ------- list of rubicon.domain.Dataframe The dataframes logged to the specified object. """ dataframe_metadata_root = self._get_dataframe_metadata_root( project_name, experiment_id) try: dataframe_metadata_paths = self._ls_directories_only( dataframe_metadata_root) dataframes = [ domain.Dataframe(**json.loads(data)) for data in self.filesystem.cat(dataframe_metadata_paths).values() ] except FileNotFoundError: return [] return dataframes
def get_dataframe_metadata(self, project_name, dataframe_id, experiment_id=None): """Retrieve a dataframes's metadata from the configured filesystem. Parameters ---------- project_name : str The name of the project the dataframe with ID `dataframe_id` is logged to. dataframe_id : str The ID of the dataframe to retrieve. experiment_id : str, optional The ID of the experiment the dataframe with ID `dataframe_id` is logged to. Dataframes do not need to belong to an experiment. Returns ------- rubicon.domain.Dataframe The dataframe with ID `dataframe_id`. """ dataframe_metadata_path = self._get_dataframe_metadata_path( project_name, experiment_id, dataframe_id ) try: open_file = self.filesystem.open(dataframe_metadata_path) except FileNotFoundError: raise RubiconException(f"No dataframe with id `{dataframe_id}` found.") with open_file as f: dataframe = json.load(f) return domain.Dataframe(**dataframe)
async def get_dataframes_metadata(self, project_name, experiment_id=None): """Overrides `rubicon.repository.BaseRepository.get_dataframes_metadata` to asynchronously retrieve all dataframes' metadata from the configured filesystem that belong to the specified object. Parameters ---------- project_name : str The name of the project to retrieve all dataframes from. experiment_id : str, optional The ID of the experiment to retrieve all dataframes from. Dataframes do not need to belong to an experiment. Returns ------- list of rubicon.domain.Dataframe The dataframes logged to the specified object. """ dataframe_metadata_root = self._get_dataframe_metadata_root(project_name, experiment_id) try: dataframe_metadata_paths = await self._ls_directories_only(dataframe_metadata_root) dataframes = [ domain.Dataframe(**json.loads(data)) for data in await asyncio.gather( *[self.filesystem._cat_file(path) for path in dataframe_metadata_paths] ) ] except FileNotFoundError: return [] return dataframes
async def get_dataframe_metadata(self, project_name, dataframe_id, experiment_id=None): """Overrides `rubicon.repository.BaseRepository.get_dataframe_metadata` to asynchronously retrieve a dataframes's metadata from the configured filesystem. Parameters ---------- project_name : str The name of the project the dataframe with ID `dataframe_id` is logged to. dataframe_id : str The ID of the dataframe to retrieve. experiment_id : str, optional The ID of the experiment the dataframe with ID `dataframe_id` is logged to. Dataframes do not need to belong to an experiment. Returns ------- rubicon.domain.Dataframe The dataframe with ID `dataframe_id`. """ dataframe_metadata_path = self._get_dataframe_metadata_path( project_name, experiment_id, dataframe_id ) try: dataframe = json.loads(await self.filesystem._cat_file(dataframe_metadata_path)) except FileNotFoundError: raise RubiconException(f"No dataframe with id `{dataframe_id}` found.") return domain.Dataframe(**dataframe)
def _create_dataframe(repository, project=None, dataframe_data=None): if project is None: project = _create_project(repository) if dataframe_data is None: dataframe_data = pd.DataFrame([[0, 1], [1, 0]], columns=["a", "b"]) dataframe = domain.Dataframe(parent_id=project.id) repository.create_dataframe(dataframe, dataframe_data, project.name) return dataframe
def test_properties(project_client): parent = project_client domain_dataframe = domain.Dataframe(description="some description", tags=["x"]) dataframe = Dataframe(domain_dataframe, parent) assert dataframe.id == domain_dataframe.id assert dataframe.description == "some description" assert dataframe.tags == domain_dataframe.tags assert dataframe.created_at == domain_dataframe.created_at assert dataframe.parent == parent
def _create_dask_dataframe(repository, project=None): if project is None: project = _create_project(repository) df = pd.DataFrame([[0, 1, "a"], [1, 1, "b"], [2, 2, "c"], [3, 2, "d"]], columns=["a", "b", "c"]) ddf = dd.from_pandas(df, npartitions=1) dataframe = domain.Dataframe(parent_id=project.id) repository.create_dataframe(dataframe, ddf, project.name) return dataframe
def test_delete_dataframes(asyn_client_w_mock_repo): rubicon = asyn_client_w_mock_repo project_name = f"Test Project {uuid.uuid4()}" project = asyncio.run(rubicon.create_project(project_name)) dataframe_domains = [domain.Dataframe(parent_id=project.id) for _ in range(0, 3)] dataframe_ids = [d.id for d in dataframe_domains] asyncio.run(DataframeMixin.delete_dataframes(project, dataframe_ids)) expected = [ call.delete_dataframe(project.name, dataframe_id, experiment_id=None) for dataframe_id in dataframe_ids ] assert rubicon.repository.mock_calls[1:] == expected
def test_get_dataframe_tags_with_experiment_parent_root(memory_repository): repository = memory_repository experiment = _create_experiment(repository) dataframe = domain.Dataframe(parent_id=experiment.id) dataframe_data = pd.DataFrame([[0, 1], [1, 0]], columns=["a", "b"]) repository.create_dataframe(dataframe, dataframe_data, experiment.project_name, experiment.id) dataframe_tags_root = repository._get_tag_metadata_root( experiment.project_name, experiment_id=experiment.id, dataframe_id=dataframe.id) assert (dataframe_tags_root == f"{repository.root_dir}/{slugify(experiment.project_name)}/" + f"experiments/{experiment.id}/dataframes/{dataframe.id}")
def _create_pandas_dataframe(repository, project=None, dataframe_data=None, multi_index=False): if project is None: project = _create_project(repository) if dataframe_data is None: dataframe_data = pd.DataFrame( [[0, 1, "a"], [1, 1, "b"], [2, 2, "c"], [3, 2, "d"]], columns=["a", "b", "c"]) if multi_index: dataframe_data = dataframe_data.set_index(["b", "a"]) # Set multiindex dataframe = domain.Dataframe(parent_id=project.id) repository.create_dataframe(dataframe, dataframe_data, project.name) return dataframe
def test_get_dataframes(asyn_client_w_mock_repo): rubicon = asyn_client_w_mock_repo project_name = f"Test Project {uuid.uuid4()}" project = asyncio.run(rubicon.create_project(project_name)) dataframe_domains = [domain.Dataframe(parent_id=project.id) for _ in range(0, 3)] rubicon.repository.get_dataframes_metadata.return_value = dataframe_domains dataframes = asyncio.run(DataframeMixin.dataframes(project)) expected = [call.get_dataframes_metadata(project.name, experiment_id=None)] dataframe_ids = [d.id for d in dataframes] for dataframe_id in [d.id for d in dataframe_domains]: assert dataframe_id in dataframe_ids dataframe_ids.remove(dataframe_id) assert len(dataframe_ids) == 0 assert rubicon.repository.mock_calls[1:] == expected
def _create_dataframe_domain(project=None, tags=[]): if project is None: project = domain.Project(f"Test Project {uuid.uuid4()}") return project, domain.Dataframe(parent_id=project.id)