Example #1
def load_data(dataset_name, budget):
    """
    Only works with categorical/ordinal columns as of now

    SQL scenario...?
    """
    # Load dataset from service (dataset is pd.DataFrame)
    dataset_document = DATASET_CLIENT.read(dataset_name, budget)
    dataset = load_dataset(dataset_document)
    schema = load_metadata(dataset_document)

    # NOTE: Data clipping per schema is not yet required for the
    # supported synthetic data scenarios

    # TODO: Support categorical, ordinal and continuous specification through schema
    categorical_columns = []
    ordinal_columns = []

    # TODO: Temporary support for dropping the 'pid' and 'income' columns
    if 'pid' in dataset.columns:
        dataset.drop('pid', axis=1, inplace=True)
    if 'income' in dataset.columns:
        dataset.drop('income', axis=1, inplace=True)

    return dataset, dataset_document, {
        'categorical_columns': categorical_columns,
        'ordinal_columns': ordinal_columns
    }, schema
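
# A hedged usage sketch for load_data; "example_csv" and the budget of 1.0
# are illustrative values, not taken from the original source:
dataset, dataset_document, column_info, schema = load_data("example_csv", 1.0)
print(column_info["categorical_columns"], column_info["ordinal_columns"])
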
def test_read_release_no_penalty(dataset_client, dataset_name):
    """
    READ (RELEASE) TEST
    Further read_released calls do not incur a budget cost
    """
    # Impersonate a caller by setting the client_guid header (mock value)
    dataset_client.custom_headers = {'client_guid': 'mock_user_guid'}
    dataset_document = dataset_client.read_released(dataset_name)
    df = load_dataset(dataset_document)
    assert isinstance(df, pd.DataFrame)
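
# dataset_client and dataset_name arrive as pytest fixtures. A minimal sketch
# of how they might be defined; this is hypothetical, the real fixtures live
# in the project's conftest.py, and "example_csv" is an illustrative name:
import pytest

@pytest.fixture
def dataset_client():
    return get_dataset_client()

@pytest.fixture
def dataset_name():
    return "example_csv"
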
def test_auth_register_release_csv(dataset):
    """
    Tests registering a dataset with a list of authorized GUIDs: reads
    succeed for those clients and fail for all others.

    Also checks that subsequent reads do not incur a budget cost.
    """
    # Create nine external clients: indices 5-8 will be authorized, 0-4 will not
    external_clients = [get_dataset_client() for _ in range(0, 9)]
    valid_clients = external_clients[5:9]
    invalid_clients = external_clients[0:5]

    valid_guids = [c._guid for c in valid_clients]

    service_client = get_dataset_client()
    dataset["authorized_users"] = valid_guids

    # register() takes an optional list of authorized user GUIDs
    response = service_client.register(dataset)
    assert response.result == dataset['dataset_name']

    # Fake DP perturb step; a similar release will be done in the module
    retrieved_dataset = service_client.read(dataset["dataset_name"], 1.0)
    retrieved_dataset.dataset_name = "release_authorize_csv"
    release_doc = {
        "dataset_name": "release_authorize_csv",
        "dataset_type": retrieved_dataset.dataset_type,
        retrieved_dataset.dataset_type: retrieved_dataset.csv_details,
        "release_cost": retrieved_dataset.release_cost,
        "budget": retrieved_dataset.budget,
        "authorized_users": retrieved_dataset.authorized_users
    }
    release_dataset = service_client.release(release_doc)
    assert release_dataset.dataset_name == "release_authorize_csv"

    # Should have same authorized users
    assert release_dataset.authorized_users == valid_guids

    # Attempt to read from released dataset with valid clients
    for c in valid_clients:
        dataset_document = c.read_released(release_dataset.dataset_name)
        df = load_dataset(dataset_document)
        assert isinstance(df, pd.DataFrame)

    # Attempt to read from released dataset with invalid clients
    for c in invalid_clients:
        with pytest.raises(Exception) as error:
            c.read_released(release_dataset.dataset_name)

        assert error.typename == "HttpOperationError"
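
# The dataset fixture above presumably provides a registration document like
# the following; the keys mirror those used in release_doc, and all values
# are illustrative:
dataset = {
    "dataset_name": "example_csv",
    "dataset_type": "csv_details",
    "csv_details": {"local_path": "serverside/path/to/example.csv"},
    "budget": 10.0,
    "release_cost": 1.0,
    "authorized_users": []
}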
Example #4
"""
Sample Command line:

python register_dataset.py private_csv csv_details 10.0 False local_path=serverside/path/to/example.csv
"""

import os
import sys

import mlflow
import pandas as pd
import yaml

# get_dataset_client and load_dataset come from the service's helper module
# (imports omitted in the original snippet)

if __name__ == "__main__":
    private_dataset_name = sys.argv[1]
    release_dataset_name = sys.argv[2]
    budget = float(sys.argv[3])

    with mlflow.start_run():
        service_client = get_dataset_client()
        dataset_document = service_client.read(private_dataset_name, budget)

        input_dataframe = load_dataset(dataset_document)
        # TODO: apply the DP perturbation here ("make_new_df"?)
        new_dataframe = pd.DataFrame(input_dataframe)

        prev_path = dataset_document['csv_details']['path']
        new_path = os.path.join(os.path.dirname(prev_path),
                                release_dataset_name + '.csv')

        prev_schema = dataset_document['csv_details']['schema']
        new_schema = os.path.join(os.path.dirname(prev_schema),
                                  release_dataset_name + '.yaml')

        new_dataframe.to_csv(new_path, index=False)

        # Copy the schema alongside the released csv
        with open(prev_schema) as schema_file:
            schema = yaml.safe_load(schema_file)
        with open(new_schema, 'w') as yaml_path:
            yaml.dump(schema, yaml_path, default_flow_style=False)
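
# The script above assumes a csv dataset document that carries server-side
# paths for both the data file and its schema, roughly this shape
# (values are illustrative):
example_document = {
    "csv_details": {
        "path": "serverside/path/to/private_csv.csv",
        "schema": "serverside/path/to/private_csv.yaml"
    }
}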
Example #5
import json
import logging
import sys

import mlflow
import numpy as np

# get_dataset_client, load_dataset and load_metadata come from the service's
# helper module (imports omitted in the original snippet)

if __name__ == "__main__":
    dataset_name = sys.argv[1]  # string
    budget = float(sys.argv[2])  # any number

    # We expect the next two inputs in the following format: json.dumps([col, names, here]) (e.g. '["a","b"]')
    x_features = json.loads(sys.argv[3])  # features
    y_targets = json.loads(sys.argv[4])  # class labels

    with mlflow.start_run(run_name="diffpriv_logreg"):
        # log attributes
        mlflow.log_param("dataset_name", dataset_name)
        mlflow.log_param("budget", budget)
        mlflow.log_param("x_features", x_features)
        mlflow.log_param("y_targets", y_targets)

        dataset_document = get_dataset_client().read(dataset_name, budget)
        dataset = load_dataset(dataset_document)
        schema = load_metadata(dataset_document)

        # Use the column names to select X and y for LogisticRegression
        X = dataset[x_features]
        # ravel() flattens the target column vector into the 1-D array
        # that fit() expects
        y = np.ravel(dataset[y_targets])

        # TODO: Calculate the max norm from the schema instead
        norms = np.linalg.norm(dataset, axis=1)  # L2 norm of each row
        max_norm = np.amax(norms)
        logging.warning(
            'Calculating the data norm from the data itself instead of using a '
            'schema-specified value. This is bad practice (the norm leaks '
            'information about the data) and will eventually be changed.')
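
        # The snippet ends before the model is trained. A plausible
        # continuation using IBM's diffprivlib; this is an assumption, the
        # original may use a different DP library:
        from diffprivlib.models import LogisticRegression

        model = LogisticRegression(epsilon=budget, data_norm=max_norm)
        model.fit(X, y)
        mlflow.log_metric("train_accuracy", model.score(X, y))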
def test_load_dataverse_dataset_file(dataset_client, dataset_name, budget):
    """Reading a Dataverse dataset should return a pandas DataFrame."""
    dataset_document = dataset_client.read(dataset_name, budget)
    df = load_dataset(dataset_document)
    assert isinstance(df, pd.DataFrame)