def test_auth_register_release_csv(dataset):
    """
    Tests registering a dataset with authorized guids: reads of the release
    succeed for authorized clients and fail otherwise. Also checks that
    subsequent reads of the released dataset don't incur budget.
    """
    # Generate some valid/invalid clients
    external_clients = [get_dataset_client() for _ in range(0, 9)]
    valid_clients = external_clients[5:9]
    invalid_clients = external_clients[0:5]
    valid_guids = [c._guid for c in valid_clients]

    service_client = get_dataset_client()
    dataset["authorized_users"] = valid_guids

    # register() takes an optional list of authorized guids (clients at indices 5-8 here)
    response = service_client.register(dataset)
    assert response.result == dataset['dataset_name']

    # Fake DP perturbation; a similar release will be done in the module
    retrieved_dataset = service_client.read(dataset["dataset_name"], 1.0)
    retrieved_dataset.dataset_name = "release_authorize_csv"

    release_doc = {
        "dataset_name": "release_authorize_csv",
        "dataset_type": retrieved_dataset.dataset_type,
        retrieved_dataset.dataset_type: retrieved_dataset.csv_details,
        "release_cost": retrieved_dataset.release_cost,
        "budget": retrieved_dataset.budget,
        "authorized_users": retrieved_dataset.authorized_users
    }

    release_dataset = service_client.release(release_doc)
    assert release_dataset.dataset_name == "release_authorize_csv"

    # The release should preserve the authorized users
    assert release_dataset.authorized_users == valid_guids

    # Reads from the released dataset succeed for authorized clients
    for c in valid_clients:
        dataset_document = c.read_released(release_dataset.dataset_name)
        df = load_dataset(dataset_document)
        assert isinstance(df, pd.DataFrame)

    # ... and fail for unauthorized clients
    for c in invalid_clients:
        with pytest.raises(Exception) as error:
            c.read_released(release_dataset.dataset_name)
        assert error.typename == "HttpOperationError"
def test_register_dataverse(dataset_client, dataset):
    """
    REGISTER TEST
    Checks that a service owner can register a new dataset (dataverse remote).
    """
    response = get_dataset_client().register(dataset)
    assert response.result == dataset['dataset_name']
import json
import sys

import mlflow
import pandas as pd

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata
from opendp.whitenoise.sql import PrivateReader

if __name__ == "__main__":
    dataset_name = sys.argv[1]
    budget = float(sys.argv[2])
    query = sys.argv[3]

    with mlflow.start_run():
        dataset_document = get_dataset_client().read(dataset_name, budget)
        reader = load_reader(dataset_document)
        metadata = load_metadata(dataset_document)

        # Split the total budget evenly across the columns the query touches
        budget_per_column = budget / PrivateReader.get_budget_multiplier(
            metadata, reader, query)
        private_reader = PrivateReader(metadata, reader, budget_per_column)

        rowset = private_reader.execute(query)
        result = {"query_result": rowset}
        df = pd.DataFrame(rowset[1:], columns=rowset[0])

        with open("result.json", "w") as stream:
            json.dump(df.to_dict(), stream)
        mlflow.log_artifact("result.json")
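# Hypothetical invocation of the script above (script name, dataset name, and
# query are placeholders, not values from this repo):
#     python run_query.py example_csv 1.0 "SELECT COUNT(*) FROM example.example"
#
# Sketch only (not part of the repo): because the artifact is written with
# DataFrame.to_dict(), the logged result.json round-trips back into a DataFrame.
import json
import pandas as pd

with open("result.json") as stream:
    df = pd.DataFrame(json.load(stream))
print(df.head())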
import pandas as pd

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata, load_dataset
from opendp.whitenoise.sql.private_reader import PrivateReader
from pandasql import sqldf

from sdgym.constants import CONTINUOUS
from sdgym.synthesizers.utils import Transformer

from mwem.mwem import MWEMSynthesizer

# List of supported DP synthesizers
SUPPORTED_SYNTHESIZERS = {'MWEMSynthesizer': MWEMSynthesizer}

# Maintain a single dataset client for the module
DATASET_CLIENT = get_dataset_client()


def load_data(dataset_name, budget):
    """
    Loads a dataset from the service, spending `budget`.
    Only works with categorical/ordinal columns as of now.
    TODO: SQL scenario?
    """
    # Load dataset from service (dataset is a pd.DataFrame)
    dataset_document = DATASET_CLIENT.read(dataset_name, budget)
    dataset = load_dataset(dataset_document)
    schema = load_metadata(dataset_document)

    # NOTE: As of right now, any data clipping per schema is not
    # required for the supported synthetic data scenarios
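# Sketch only (assumed API, not confirmed by this module): load_data above is
# truncated here, but assuming it ultimately returns the loaded DataFrame, a
# synthesizer from SUPPORTED_SYNTHESIZERS could be fit and sampled. fit/sample
# follow the usual sdgym synthesizer convention; adjust to MWEMSynthesizer's
# real signature.
def synthesize(dataset_name, budget, n_samples=1000):
    data = load_data(dataset_name, budget)
    synthesizer = SUPPORTED_SYNTHESIZERS['MWEMSynthesizer']()
    synthesizer.fit(data.to_numpy())
    return synthesizer.sample(n_samples)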
@pytest.fixture
def dataset_client(client):
    return get_dataset_client()
import os
import sys

import mlflow
import pandas as pd
import yaml

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_dataset
from opendp.whitenoise.sql.private_reader import PrivateReader
from pandasql import sqldf

"""
Sample command line:
python register_dataset.py private_csv csv_details 10.0 False local_path=serverside/path/to/example.csv
"""

if __name__ == "__main__":
    private_dataset_name = sys.argv[1]
    release_dataset_name = sys.argv[2]
    budget = float(sys.argv[3])

    with mlflow.start_run():
        service_client = get_dataset_client()
        dataset_document = service_client.read(private_dataset_name, budget)
        input_dataframe = load_dataset(dataset_document)
        new_dataframe = pd.DataFrame(input_dataframe)

        # DP stuff here? "make_new_df"? (see the perturbation sketch below)

        prev_path = dataset_document['csv_details']['path']
        new_path = os.path.join(os.path.dirname(prev_path),
                                release_dataset_name + '.csv')
        prev_schema = dataset_document['csv_details']['schema']
        new_schema = os.path.join(os.path.dirname(prev_schema),
                                  release_dataset_name + '.yaml')

        new_dataframe.to_csv(new_path, index=False)

        # Copy the schema alongside the released csv
        with open(prev_schema) as schema_file:
            schema = yaml.safe_load(schema_file)
        with open(new_schema, 'w') as yaml_path:
            yaml.dump(schema, yaml_path, default_flow_style=False)
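# The "DP stuff here?" placeholder above is unresolved in the original. As a
# minimal sketch only (a plain Laplace mechanism with an assumed sensitivity,
# not this repo's actual release mechanism), numeric columns could be
# perturbed before the release is written out:
import numpy as np

def perturb_numeric(df, epsilon, sensitivity=1.0):
    """Add Laplace(sensitivity/epsilon) noise to every numeric column."""
    noisy = df.copy()
    numeric_cols = noisy.select_dtypes(include="number").columns
    for col in numeric_cols:
        noise = np.random.laplace(0.0, sensitivity / epsilon, size=len(noisy))
        noisy[col] = noisy[col] + noise
    return noisy

# e.g. new_dataframe = perturb_numeric(new_dataframe, epsilon=budget)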