def load_data(dataset_name, budget):
    """Load a dataset and its schema from the dataset service.

    Only works with categorical/ordinal columns as of now
    (SQL scenario...?).

    :param dataset_name: Name of the dataset registered with the service.
    :param budget: Privacy budget to spend reading the dataset.
    :return: Tuple of ``(dataset, dataset_document, column_spec, schema)``
        where ``dataset`` is a ``pd.DataFrame``, ``column_spec`` is a dict
        with ``'categorical_columns'`` and ``'ordinal_columns'`` lists
        (currently empty — see TODO below), and ``schema`` is the metadata
        loaded from the dataset document.
    """
    # Load dataset from service (dataset is pd.DataFrame)
    dataset_document = DATASET_CLIENT.read(dataset_name, budget)
    dataset = load_dataset(dataset_document)
    schema = load_metadata(dataset_document)

    # NOTE: As of right now, any data clipping per schema is not
    # required for the supported synthetic data scenarios.
    # TODO: Support categorical, ordinal and continuous specification
    # through schema.
    categorical_columns = []
    ordinal_columns = []

    # TODO: Temporary support for dropping id-like columns; drop both
    # in a single pass instead of two separate in-place drops.
    columns_to_drop = [col for col in ('pid', 'income')
                       if col in dataset.columns]
    if columns_to_drop:
        dataset.drop(columns=columns_to_drop, inplace=True)

    return dataset, dataset_document, {
        'categorical_columns': categorical_columns,
        'ordinal_columns': ordinal_columns
    }, schema
"""Run a differentially private SQL query and log the result to MLflow.

Usage: python <script> <dataset_name> <budget> <query>
"""
import json
import sys

import mlflow  # was missing: mlflow.start_run()/log_artifact() would raise NameError
import pandas as pd

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata
from opendp.whitenoise.sql import PrivateReader

if __name__ == "__main__":
    dataset_name = sys.argv[1]
    budget = float(sys.argv[2])
    query = sys.argv[3]

    with mlflow.start_run():
        # Resolve the dataset and build the reader + metadata it needs.
        dataset_document = get_dataset_client().read(dataset_name, budget)
        reader = load_reader(dataset_document)
        metadata = load_metadata(dataset_document)

        # Spread the total budget across the columns the query touches.
        budget_per_column = budget / PrivateReader.get_budget_multiplier(
            metadata, reader, query)
        private_reader = PrivateReader(metadata, reader, budget_per_column)

        # First row of the rowset is the header; the rest are data rows.
        rowset = private_reader.execute(query)
        df = pd.DataFrame(rowset[1:], columns=rowset[0])

        # Persist the result and attach it to the MLflow run as an artifact.
        with open("result.json", "w") as stream:
            json.dump(df.to_dict(), stream)

        mlflow.log_artifact("result.json")