Example #1
def test_auth_register_release_csv(dataset):
    """
    Tests registering with authorized guids, check successful reads for them
    + unsuccessful reads otherwise

    Check that subsequent reads don't incur budget
    """
    # Generate ten clients: ids 5-9 will be authorized, ids 0-4 will not
    external_clients = [get_dataset_client() for _ in range(0, 10)]
    valid_clients = external_clients[5:10]
    invalid_clients = external_clients[0:5]

    valid_guids = [c._guid for c in valid_clients]

    service_client = get_dataset_client()
    dataset["authorized_users"] = valid_guids

    # register() takes the dataset document, including the optional
    # authorized_users list set above (clients 5-9)
    response = service_client.register(dataset)
    assert response.result == dataset['dataset_name']

    # Fake DP Perturb, similar release will be done in module
    retrieved_dataset = service_client.read(dataset["dataset_name"], 1.0)
    retrieved_dataset.dataset_name = "release_authorize_csv"
    release_doc = {
        "dataset_name": "release_authorize_csv",
        "dataset_type": retrieved_dataset.dataset_type,
        retrieved_dataset.dataset_type: retrieved_dataset.csv_details,
        "release_cost": retrieved_dataset.release_cost,
        "budget": retrieved_dataset.budget,
        "authorized_users": retrieved_dataset.authorized_users
    }
    release_dataset = service_client.release(release_doc)
    assert release_dataset.dataset_name == "release_authorize_csv"

    # Should have same authorized users
    assert release_dataset.authorized_users == valid_guids

    # Attempt to read from released dataset with valid clients
    for c in valid_clients:
        dataset_document = c.read_released(release_dataset.dataset_name)
        df = load_dataset(dataset_document)
        assert isinstance(df, pd.DataFrame)

    # Attempt to read from released dataset with invalid clients
    for c in invalid_clients:
        with pytest.raises(Exception) as error:
            c.read_released(release_dataset.dataset_name)

        assert error.typename == "HttpOperationError"
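The dataset fixture consumed above isn't shown on this page. A minimal sketch of what such a dataset document might look like, assuming a locally hosted csv (the paths and budget values are made up; the keys mirror the release_doc above and the csv_details fields used in Example #6):

import pytest

@pytest.fixture
def dataset():
    # Hypothetical dataset document; all field values here are assumptions
    return {
        "dataset_name": "example_csv",
        "dataset_type": "csv_details",
        "csv_details": {
            "path": "serverside/path/to/example.csv",
            "schema": "serverside/path/to/example.yaml"
        },
        "budget": 10.0,
        "release_cost": 1.0,
        "authorized_users": []   # overwritten by the test above
    }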
Example #2
def test_register_dataverse(dataset_client, dataset):
    """
    REGISTER TEST
    Checks that a service owner can register a new dataset (dataverse remote)
    """
    response = dataset_client.register(dataset)
    assert response.result == dataset['dataset_name']
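The dataverse-flavored dataset document isn't shown either. A hedged sketch by analogy with the csv documents elsewhere on this page (every dataverse_details key here is an assumption, not the service's confirmed schema):

# Hypothetical dataverse dataset document; dataverse_details keys are assumed
dataverse_dataset = {
    "dataset_name": "demo_dataverse",
    "dataset_type": "dataverse_details",
    "dataverse_details": {
        "host": "https://demo.dataverse.org",        # assumed field
        "doi": "doi:10.5072/FK2/EXAMPLE",            # assumed field
        "schema": "serverside/path/to/example.yaml"  # assumed field
    },
    "budget": 10.0
}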
Example #3
import json
import sys

import mlflow
import pandas as pd

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata
from opendp.whitenoise.sql import PrivateReader

if __name__ == "__main__":
    dataset_name = sys.argv[1]
    budget = float(sys.argv[2])
    query = sys.argv[3]

    with mlflow.start_run():
        dataset_document = get_dataset_client().read(dataset_name, budget)
        reader = load_reader(dataset_document)
        metadata = load_metadata(dataset_document)

        budget_per_column = budget / PrivateReader.get_budget_multiplier(
            metadata, reader, query)
        private_reader = PrivateReader(metadata, reader, budget_per_column)

        rowset = private_reader.execute(query)
        # First row of the rowset is the header; the rest are the data rows
        df = pd.DataFrame(rowset[1:], columns=rowset[0])

        with open("result.json", "w") as stream:
            json.dump(df.to_dict(), stream)
        mlflow.log_artifact("result.json")
Example #4
import pandas as pd
from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_reader, load_metadata, load_dataset
from opendp.whitenoise.sql.private_reader import PrivateReader
from pandasql import sqldf

from sdgym.constants import CONTINUOUS
from sdgym.synthesizers.utils import Transformer

from mwem.mwem import MWEMSynthesizer

# List of supported DP synthesizers
SUPPORTED_SYNTHESIZERS = {'MWEMSynthesizer': MWEMSynthesizer}

# Maintain a dataset client
DATASET_CLIENT = get_dataset_client()


def load_data(dataset_name, budget):
    """
    Only works with categorical/ordinal columns as of now
    
    SQL scenario...?
    """
    # Load dataset from service (dataset is pd.DataFrame)
    dataset_document = DATASET_CLIENT.read(dataset_name, budget)
    dataset = load_dataset(dataset_document)
    schema = load_metadata(dataset_document)

    # NOTE: As of right now, any data clipping per schema is not
    # required for the supported synthetic data scenarios
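The example is truncated at the source; load_data stops after fetching the frame and schema. A hedged sketch of how the pieces above might be wired together (the synthesizer's constructor, fit, and sample signatures are assumptions, not the confirmed MWEM API):

# Hypothetical continuation; every call on the synthesizer is an assumed API
def synthesize(dataset_name, budget, n_samples):
    dataset_document = DATASET_CLIENT.read(dataset_name, budget)
    df = load_dataset(dataset_document)

    synthesizer = SUPPORTED_SYNTHESIZERS['MWEMSynthesizer']()
    synthesizer.fit(df.to_numpy())         # assumed signature
    return synthesizer.sample(n_samples)   # assumed signature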
Example #5
def dataset_client(client):
    return get_dataset_client()
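This is presumably the pytest fixture consumed by injection in tests like Example #2's test_register_dataverse(dataset_client, dataset); the @pytest.fixture decorator would sit on the line above in the source.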
Example #6
import os
import sys

import mlflow
import pandas as pd
import yaml

from opendp.whitenoise.client import get_dataset_client
from opendp.whitenoise.data.adapters import load_dataset

"""
Sample command line (args: private dataset name, release dataset name, budget):

python <script>.py <private_dataset_name> <release_dataset_name> <budget>
"""

if __name__ == "__main__":
    private_dataset_name = sys.argv[1]
    release_dataset_name = sys.argv[2]
    budget = float(sys.argv[3])
    
    with mlflow.start_run():
        service_client = get_dataset_client()
        dataset_document = service_client.read(private_dataset_name, budget)

        input_dataframe = load_dataset(dataset_document)
        new_dataframe = pd.DataFrame(input_dataframe)  # TODO: apply the DP perturbation here ("make_new_df"?)

        prev_path = dataset_document['csv_details']['path']
        new_path = os.path.join(os.path.dirname(prev_path), release_dataset_name + '.csv')
        
        prev_schema = dataset_document['csv_details']['schema']
        new_schema = os.path.join(os.path.dirname(prev_schema), release_dataset_name + '.yaml')

        new_dataframe.to_csv(new_path, index=False)

        # Copy the private dataset's schema alongside the released csv
        with open(prev_schema) as prev_yaml:
            schema = yaml.safe_load(prev_yaml)
        with open(new_schema, 'w') as new_yaml:
            yaml.dump(schema, new_yaml, default_flow_style=False)
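A plausible follow-up, not shown in the source, would be to publish the new files as a release document shaped like the one in Example #1 (the release_cost and budget values here are assumptions):

# Hypothetical follow-up, mirroring Example #1's release_doc shape
release_doc = {
    "dataset_name": release_dataset_name,
    "dataset_type": "csv_details",
    "csv_details": {"path": new_path, "schema": new_schema},
    "release_cost": budget,   # assumed: cost equals the budget spent on the read
    "budget": budget,         # assumed remaining budget
    "authorized_users": []
}
service_client.release(release_doc)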