Code example #1
from opendp.whitenoise.client import get_dataset_client
# load_dataset and load_metadata come from the data adapters module, as in the
# SQL example below (assumed here so the function is self-contained)
from opendp.whitenoise.data.adapters import load_dataset, load_metadata

# Client for the dataset service, shared across calls
DATASET_CLIENT = get_dataset_client()


def load_data(dataset_name, budget):
    """
    Load a dataset and its schema from the dataset service.

    Only categorical/ordinal columns are supported for now.
    TODO: Decide how the SQL scenario should be handled.
    """
    # Load dataset from service (dataset is pd.DataFrame)
    dataset_document = DATASET_CLIENT.read(dataset_name, budget)
    dataset = load_dataset(dataset_document)
    schema = load_metadata(dataset_document)

    # NOTE: Data clipping per the schema is not currently required for the
    # supported synthetic data scenarios

    # TODO: Support categorical, ordinal and continuous specification through the schema
    categorical_columns = []
    ordinal_columns = []

    # TODO: Temporary support for dropping the id ('pid') and 'income' columns
    if 'pid' in dataset.columns:
        dataset.drop('pid', axis=1, inplace=True)
    if 'income' in dataset.columns:
        dataset.drop('income', axis=1, inplace=True)

    return dataset, dataset_document, {
        'categorical_columns': categorical_columns,
        'ordinal_columns': ordinal_columns
    }, schema
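
A minimal sketch of how load_data might be called; the dataset name and budget value below are placeholders for illustration, not part of the original sample.

dataset, dataset_document, column_info, schema = load_data("example_dataset", 3.0)
print(dataset.shape)
print(column_info['categorical_columns'], column_info['ordinal_columns'])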
Code example #2
import json
import sys

import mlflow
import pandas as pd

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata
from opendp.whitenoise.sql import PrivateReader

if __name__ == "__main__":
    dataset_name = sys.argv[1]
    budget = float(sys.argv[2])
    query = sys.argv[3]

    with mlflow.start_run():
        dataset_document = get_dataset_client().read(dataset_name, budget)
        reader = load_reader(dataset_document)
        metadata = load_metadata(dataset_document)

        # Divide the total budget by the query's budget multiplier to get a per-column epsilon
        budget_per_column = budget / PrivateReader.get_budget_multiplier(
            metadata, reader, query)
        private_reader = PrivateReader(metadata, reader, budget_per_column)

        rowset = private_reader.execute(query)
        # The first row of the rowset contains the column names
        df = pd.DataFrame(rowset[1:], columns=rowset[0])

        with open("result.json", "w") as stream:
            json.dump(df.to_dict(), stream)
        mlflow.log_artifact("result.json")
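
Assuming the script above is saved as run_query.py (a hypothetical name), it takes a dataset name, a privacy budget, and a SQL query as command-line arguments; the values below are placeholders.

python run_query.py example_dataset 3.0 "SELECT COUNT(*) FROM example.example"

The differentially private result is written to result.json and logged as an artifact to the active MLflow run.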