def generate_name(filename: str, attempt: int = 1) -> str:
    """Generates a dataset name from a given filename.

    Args:
        filename (str): source filename.
        attempt (int): the current attempt of generating a new name.

    Returns:
        str: new generated dataset name.
    """
    # normalize filename to ASCII characters
    # replace spaces by dashes
    name = normalize('NFKD', filename) \
        .encode('ASCII', 'ignore') \
        .replace(b' ', b'-') \
        .decode()

    if attempt > 1:
        # adds a suffix '-NUMBER' to the filename
        name, extension = splitext(name)
        name = f"{name}-{attempt}{extension}"

    try:
        # check whether the generated name is already in use
        stat_dataset(name)
    except FileNotFoundError:
        return name

    # if it is already in use, retry with the next suffix
    return generate_name(filename, attempt + 1)

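# A minimal usage sketch for generate_name above (hypothetical inputs; it
# assumes the platiagro client is configured, that "iris.csv" is already
# taken in the object storage, and that "iris-data.csv" is not):
#
#     generate_name("íris data.csv")  # accents stripped, spaces dashed -> "iris-data.csv"
#     generate_name("iris.csv")       # name in use, retried with suffix -> "iris-2.csv"
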
def get_dataset(project_id, experiment_id, operator_id):
    """Retrieves a dataset as json.

    Args:
        project_id (str): the project uuid.
        experiment_id (str): the experiment uuid.
        operator_id (str): the operator uuid.

    Returns:
        dict: the dataset contents in split orientation.
    """
    raise_if_project_does_not_exist(project_id)

    experiment = Experiment.query.get(experiment_id)
    if experiment is None:
        raise NotFound("The specified experiment does not exist")

    raise_if_operator_does_not_exist(operator_id)

    try:
        metadata = platiagro.stat_dataset(name=experiment.dataset,
                                          operator_id=operator_id)
        if "run_id" not in metadata:
            raise FileNotFoundError()

        dataset = platiagro.load_dataset(name=experiment.dataset,
                                         run_id="latest",
                                         operator_id=operator_id)
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
    except FileNotFoundError as e:
        raise NotFound(str(e))

    return dataset

def list_columns(dataset: str) -> List[Dict[str, str]]:
    """Lists all columns from a dataset.

    Args:
        dataset (str): the dataset name.

    Returns:
        A list of column names and featuretypes.

    Raises:
        NotFound: when the dataset does not exist.
    """
    try:
        metadata = stat_dataset(dataset)
        columns = metadata.get("columns", [])
        featuretypes = metadata.get("featuretypes", [])
        columns = [{"name": col, "featuretype": ftype}
                   for col, ftype in zip(columns, featuretypes)]
        return columns
    except FileNotFoundError:
        raise NotFound("The specified dataset does not exist")

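# Example of the shape list_columns returns, for a hypothetical "iris"
# dataset whose metadata stores parallel "columns" and "featuretypes" lists:
#
#     list_columns("iris")
#     # [{"name": "sepal_length", "featuretype": "Numerical"},
#     #  {"name": "species", "featuretype": "Categorical"}]
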
def list_columns(dataset):
    """
    Lists all columns from a dataset.

    Parameters
    ----------
    dataset : str
        The dataset name.

    Returns
    -------
    list
        A list of column names and featuretypes.

    Raises
    ------
    NotFound
        When the dataset does not exist.
    """
    try:
        metadata = stat_dataset(dataset)
        columns = metadata.get("columns", [])
        featuretypes = metadata.get("featuretypes", [])
        columns = [{"name": col, "featuretype": ftype}
                   for col, ftype in zip(columns, featuretypes)]
        return columns
    except FileNotFoundError:
        raise DATASET_NOT_FOUND

def get_dataset_pagination(project_id, experiment_id, operator_id, page, page_size):
    """Retrieves a dataset as json.

    Args:
        project_id (str): the project uuid.
        experiment_id (str): the experiment uuid.
        operator_id (str): the operator uuid.
        page (int): the page number.
        page_size (int): the page size.

    Returns:
        dict: a single page of the dataset contents.
    """
    raise_if_project_does_not_exist(project_id)
    raise_if_experiment_does_not_exist(experiment_id)

    operator = Operator.query.get(operator_id)
    if operator is None:
        raise NotFound("The specified operator does not exist")

    # get dataset name
    dataset = operator.parameters.get('dataset')
    if dataset is None:
        raise NotFound()

    try:
        metadata = platiagro.stat_dataset(name=dataset, operator_id=operator_id)
        if "run_id" not in metadata:
            raise FileNotFoundError()

        dataset = platiagro.load_dataset(name=dataset,
                                         run_id="latest",
                                         operator_id=operator_id)
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
    except FileNotFoundError as e:
        raise NotFound(str(e))

    return pagination_datasets(page=page, page_size=page_size, elements=dataset)

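# pagination_datasets is called above but not defined in this section. Below
# is a minimal sketch of what it is assumed to do -- slice the "data" rows of
# a DataFrame.to_dict(orient="split") payload -- not the actual helper (note
# that another call site further down passes the payload as `dataset`
# instead of `elements`):

def pagination_datasets(page, page_size, elements):
    """Returns a single page of a dataset payload in split orientation."""
    start = (page - 1) * page_size
    return {
        "columns": elements["columns"],
        "data": elements["data"][start:start + page_size],
        "total": len(elements["data"]),
    }
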
def test_update_dataset_metadata(self):
    featuretypes = [
        'Categorical',
        'Categorical',
        'Categorical',
        'Categorical',
        'Categorical',
    ]
    metadata = stat_dataset("mock.csv")
    metadata["featuretypes"] = featuretypes
    update_dataset_metadata("mock.csv", metadata)

    result = stat_dataset("mock.csv")
    expected = {
        "columns": self.mock_columns(),
        "featuretypes": featuretypes,
        "filename": "mock.csv",
        "run_id": RUN_ID,
    }
    self.assertDictEqual(result, expected)

def get_dataset(name: str) -> Dict[str, Any]:
    """Details a dataset from our object storage.

    Args:
        name (str): the dataset name to look for in our object storage.

    Returns:
        The dataset details: name, columns, and filename.

    Raises:
        NotFound: when the dataset does not exist.
    """
    try:
        metadata = stat_dataset(name)
        filename = metadata.get("original-filename")

        if "columns" in metadata and "featuretypes" in metadata:
            columns = metadata["columns"]
            featuretypes = metadata["featuretypes"]
            columns = [{"name": col, "featuretype": ftype}
                       for col, ftype in zip(columns, featuretypes)]
            return {"name": name, "columns": columns, "filename": filename}

        return {"name": name, "filename": filename}
    except FileNotFoundError:
        raise NotFound("The specified dataset does not exist")

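# The two possible return shapes of get_dataset, for hypothetical datasets
# (a tabular one with stored featuretypes, and a raw file without):
#
#     get_dataset("iris")
#     # {"name": "iris",
#     #  "columns": [{"name": "sepal_length", "featuretype": "Numerical"}, ...],
#     #  "filename": "iris.csv"}
#
#     get_dataset("archive")
#     # {"name": "archive", "filename": "archive.zip"}
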
def get_featuretypes(name):
    """
    Gets the dataset featuretypes.

    Parameters
    ----------
    name : str
        The dataset name to look for in our object storage.

    Returns
    -------
    bytes
        The dataset featuretypes, encoded.

    Raises
    ------
    NotFound
        When the dataset does not exist.
    """
    try:
        metadata = stat_dataset(name)
    except FileNotFoundError:
        raise NOT_FOUND

    metadata_featuretypes = metadata.get("featuretypes")
    featuretypes = "\n".join(metadata_featuretypes)
    return featuretypes.encode()

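# get_featuretypes serializes the list as one type per line. For a
# hypothetical dataset whose metadata holds
# {"featuretypes": ["DateTime", "Numerical", "Categorical"]} it returns:
#
#     b"DateTime\nNumerical\nCategorical"
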
def get_dataset(name, page=1, page_size=10):
    """
    Details a dataset from our object storage.

    Parameters
    ----------
    name : str
        The dataset name to look for in our object storage.
    page : int or str
        The page number. First page is 1. Defaults to 1.
    page_size : int or str
        The page size. Default value is 10.

    Returns
    -------
    dict
        The dataset details: name, columns, and filename.

    Raises
    ------
    NotFound
        When the dataset does not exist.
    BadRequest
        When `page` or `page_size` is not a valid integer.
    """
    try:
        page, page_size = int(page), int(page_size)
        metadata = stat_dataset(name)
        filename = metadata.get("original-filename")
        dataset = {"name": name, "filename": filename}

        if "columns" in metadata and "featuretypes" in metadata:
            columns = metadata["columns"]
            featuretypes = metadata["featuretypes"]
            columns = [
                {"name": col, "featuretype": ftype}
                for col, ftype in zip(columns, featuretypes)
            ]

            content = load_dataset(name)

            # Replaces NaN/Inf values by text so JSON encoding doesn't fail
            content.replace(np.nan, "NaN", inplace=True, regex=True)
            content.replace(np.inf, "Inf", inplace=True, regex=True)
            content.replace(-np.inf, "-Inf", inplace=True, regex=True)

            data = content.values.tolist()
            if page_size != -1:
                data = data_pagination(content=data, page=page, page_size=page_size)

            dataset.update(
                {"columns": columns, "data": data, "total": len(content.index)}
            )

        return dataset
    except FileNotFoundError:
        raise NOT_FOUND
    except ValueError:
        raise BadRequest("ValueError", VALUE_ERROR_MESSAGE)

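# data_pagination is called above but not defined in this section. Below is a
# minimal sketch of the assumed behavior -- slicing the list of rows, with
# page numbering starting at 1 -- not the actual helper:

def data_pagination(content, page, page_size):
    """Returns a single page of rows from the full list of dataset rows."""
    start = (page - 1) * page_size
    return content[start:start + page_size]
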
def update_column(dataset, column, featuretype):
    """
    Updates a column from a dataset.

    Parameters
    ----------
    dataset : str
        The dataset name.
    column : str
        The column name.
    featuretype : str
        The feature type (Numerical, Categorical, or DateTime).

    Returns
    -------
    dict
        The column info.

    Raises
    ------
    NotFound
        When the dataset or column does not exist.
    BadRequest
        When the featuretype is invalid.
    """
    try:
        metadata = stat_dataset(dataset)

        if "columns" not in metadata or "featuretypes" not in metadata:
            raise COLUMN_NOT_FOUND

        columns = metadata["columns"]
        if column not in columns:
            raise COLUMN_NOT_FOUND

        # sets new metadata
        index = columns.index(column)
        metadata["featuretypes"][index] = featuretype
        validate_featuretypes(metadata["featuretypes"])

        df = load_dataset(dataset)

        # uses PlatIAgro SDK to save the dataset
        save_dataset(dataset, df, metadata=metadata)
    except FileNotFoundError:
        raise DATASET_NOT_FOUND
    except ValueError as e:
        raise BadRequest("ValueError", str(e))

    return {"name": column, "featuretype": featuretype}

def patch_dataset(name, file_object):
    """
    Updates the dataset metadata in our object storage.

    Parameters
    ----------
    name : str
        The dataset name to look for in our object storage.
    file_object : dict
        File object.

    Returns
    -------
    dict
        The dataset details: name, columns, and filename.

    Raises
    ------
    BadRequest
        When incoming files are missing or invalid.
    NotFound
        When the dataset does not exist.
    """
    if not file_object.file:
        raise BadRequest("NoFeatureTypes", "No featuretypes part")

    try:
        metadata = stat_dataset(name)
    except FileNotFoundError:
        raise NOT_FOUND

    try:
        ftype_file = file_object.file
        featuretypes = list(
            map(lambda s: s.strip().decode("utf8"), ftype_file.readlines())
        )
        validate_featuretypes(featuretypes)
    except ValueError as e:
        raise BadRequest("ValueError", str(e))

    columns = metadata["columns"]
    if len(columns) != len(featuretypes):
        raise BadRequest(
            "DifferentLengths",
            "featuretypes must be the same length as the DataFrame columns",
        )

    # uses PlatIAgro SDK to update the dataset metadata
    metadata["featuretypes"] = featuretypes
    update_dataset_metadata(name=name, metadata=metadata)

    return get_dataset(name)

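# A usage sketch for patch_dataset, assuming `file_object` only needs a
# file-like `.file` attribute (here faked with SimpleNamespace/BytesIO for a
# hypothetical two-column dataset named "iris"):

from io import BytesIO
from types import SimpleNamespace

file_object = SimpleNamespace(file=BytesIO(b"Numerical\nCategorical\n"))
patch_dataset("iris", file_object)  # rewrites the stored featuretypes list
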
def generate_name(filename, attempt=1):
    """Generates a dataset name from a given filename.

    Parameters
    ----------
    filename : str
        Source filename.
    attempt : int
        The current attempt of generating a new name. Defaults to 1.

    Returns
    -------
    str
        New generated dataset name.
    """
    # normalize filename to ASCII characters
    # replace spaces by dashes
    name = (
        normalize("NFKD", filename)
        .encode("ASCII", "ignore")
        .replace(b" ", b"-")
        .decode()
    )

    if attempt > 1:
        # adds a suffix '-NUMBER' to the filename
        name, extension = splitext(name)
        name = f"{name}-{attempt}{extension}"

    try:
        # check whether the generated name is already in use
        stat_dataset(name)
    except FileNotFoundError:
        return name

    # if it is already in use, retry with the next suffix
    return generate_name(filename, attempt + 1)

def update_column(dataset: str, column: str, featuretype: str) -> Dict[str, str]:
    """Updates a column from a dataset.

    Args:
        dataset (str): the dataset name.
        column (str): the column name.
        featuretype (str): the feature type (Numerical, Categorical, or DateTime).

    Returns:
        The column info.

    Raises:
        NotFound: when the dataset or column does not exist.
        BadRequest: when the featuretype is invalid.
    """
    try:
        metadata = stat_dataset(dataset)

        if "columns" not in metadata or "featuretypes" not in metadata:
            raise NotFound("The specified column does not exist")

        columns = metadata["columns"]
        if column not in columns:
            raise NotFound("The specified column does not exist")

        # sets new metadata
        index = columns.index(column)
        metadata["featuretypes"][index] = featuretype
        validate_featuretypes(metadata["featuretypes"])

        df = load_dataset(dataset)

        # uses PlatIAgro SDK to save the dataset
        save_dataset(dataset, df, metadata=metadata)
    except FileNotFoundError:
        raise NotFound("The specified dataset does not exist")
    except ValueError as e:
        raise BadRequest(str(e))

    return {"name": column, "featuretype": featuretype}

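# Example calls to update_column, assuming a hypothetical "iris" dataset
# whose metadata lists a "species" column:
#
#     update_column("iris", "species", "Categorical")
#     # {"name": "species", "featuretype": "Categorical"}
#
#     update_column("iris", "species", "Text")
#     # raises BadRequest: "Text" is not Numerical, Categorical, or DateTime
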
def get_dataset_pagination(application_csv, name, operator_id, page, page_size, run_id):
    """Retrieves a dataset.

    Args:
        application_csv (bool): whether to return the dataset as csv.
        name (str): the dataset name.
        operator_id (str): the operator uuid.
        page (int): the page number.
        page_size (int): the number of records per page.
        run_id (str): the run id.

    Returns:
        Dataset.
    """
    try:
        metadata = platiagro.stat_dataset(name=name, operator_id=operator_id)
        if "run_id" not in metadata:
            raise FileNotFoundError()

        dataset = platiagro.load_dataset(name=name,
                                         operator_id=operator_id,
                                         run_id=run_id)
    except FileNotFoundError as e:
        raise NotFound(str(e))

    if page_size == -1:
        if application_csv:
            return dataset.to_csv(index=False)
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
        return dataset
    else:
        dataset = dataset.to_dict(orient="split")
        del dataset["index"]
        pdataset = pagination_datasets(page=page, page_size=page_size, dataset=dataset)

        if application_csv:
            df = pd.DataFrame(columns=pdataset['columns'], data=pdataset['data'])
            return df.to_csv(index=False)

        return pdataset

def get_dataset(
    self,
    project_id: str,
    experiment_id: str,
    run_id: str,
    operator_id: str,
    page: Optional[int] = 1,
    page_size: Optional[int] = 10,
    accept: Optional[str] = None,
):
    """
    Gets dataset records from a run. Supports pagination.

    Parameters
    ----------
    project_id : str
    experiment_id : str
    run_id : str
        The run_id. If `run_id=latest`, then returns datasets from the latest run_id.
    operator_id : str
    page : int
        The page number. First page is 1.
    page_size : int
        The page size. Default value is 10.
    accept : str
        Whether the dataset should be returned as a csv file. Defaults to None.

    Returns
    -------
    list
        A list of dataset records.

    Raises
    ------
    NotFound
        When any of project_id, experiment_id, run_id, or operator_id does not exist.
    """
    if run_id == "latest":
        run_id = get_latest_run_id(experiment_id)

    name = self.get_dataset_name(operator_id, experiment_id)

    try:
        metadata = stat_dataset(name=name, operator_id=operator_id, run_id=run_id)
    except FileNotFoundError:
        raise NotFound(
            code="DatasetNotFound",
            message="The specified run does not contain dataset",
        )

    dataset = load_dataset(
        name=name,
        run_id=run_id,
        operator_id=operator_id,
        page=page,
        page_size=page_size,
    )

    if isinstance(dataset, pd.DataFrame):
        # Replaces NaN value by a text "NaN" so JSON encode doesn't fail
        dataset.replace(np.nan, "NaN", inplace=True, regex=True)
        data = dataset.to_dict(orient="split")
        total = metadata.get("total", len(dataset.index))
        return {"columns": data["columns"], "data": data["data"], "total": total}

    return StreamingResponse(
        dataset,
        media_type="application/octet-stream",
    )

def test_stat_dataset(self):
    with self.assertRaises(FileNotFoundError):
        stat_dataset("UNK")

    result = stat_dataset("mock.zip")
    expected = {
        "filename": "mock.zip",
    }
    self.assertDictEqual(result, expected)

    result = stat_dataset("/tmp/data/mock.zip")
    expected = {
        "filename": "mock.zip",
    }
    self.assertDictEqual(result, expected)

    result = stat_dataset("mock.csv")
    expected = {
        "columns": self.mock_columns(),
        "featuretypes": self.mock_featuretypes(),
        "filename": "mock.csv",
        "run_id": RUN_ID,
    }
    self.assertDictEqual(result, expected)

    result = stat_dataset("mock.csv", run_id="latest", operator_id=OPERATOR_ID)
    expected = {
        "columns": self.mock_columns(),
        "featuretypes": self.mock_featuretypes(),
        "filename": "mock.csv",
        "run_id": RUN_ID,
    }
    self.assertDictEqual(result, expected)

    result = stat_dataset("mock.csv", run_id=RUN_ID, operator_id=OPERATOR_ID)
    expected = {
        "columns": self.mock_columns(),
        "featuretypes": self.mock_featuretypes(),
        "filename": "mock.csv",
        "run_id": RUN_ID,
    }
    self.assertDictEqual(result, expected)

    os.environ["RUN_ID"] = RUN_ID

    result = stat_dataset("mock.csv")
    expected = {
        "columns": self.mock_columns(),
        "featuretypes": self.mock_featuretypes(),
        "filename": "mock.csv",
        "run_id": RUN_ID,
    }
    self.assertDictEqual(result, expected)

    result = stat_dataset("mock.csv", operator_id=OPERATOR_ID)
    expected = {
        "columns": self.mock_columns(),
        "featuretypes": self.mock_featuretypes(),
        "filename": "mock.csv",
        "run_id": RUN_ID,
    }
    self.assertDictEqual(result, expected)

    run_id = "THIS_RUN_ID_DOES_NOT_EXIST"
    with self.assertRaises(FileNotFoundError):
        stat_dataset("mock.csv", run_id=run_id)