def test_sklearn_query():
    """Run a private AVG query over the sklearn iris data loaded into pandas.

    Builds an in-memory schema for the iris table, wraps the DataFrame in a
    PandasReader, and checks that a private query returns a one-row result.
    """
    bunch = sklearn.datasets.load_iris()
    frame = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)

    # Metadata mirrors the four float columns with their approximate ranges.
    float_columns = [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3),
    ]
    iris = Table("dbo", "iris", 150, float_columns)
    schema = CollectionMetadata([iris], "csv")

    reader = PandasReader(schema, frame)
    rowset = execute_private_query(
        schema, reader, 0.3, 'SELECT AVG("petal width (cm)") FROM dbo.iris'
    )

    # First row of the rowset is the header; the rest are data rows.
    result = pd.DataFrame(rowset[1:], columns=rowset[0])
    assert result is not None
    assert len(result) == 1
# FIX: the module-level setup below calls subprocess.check_output and
# os.path.join, but neither subprocess nor os was imported — added here,
# with imports grouped stdlib / third-party / local per PEP 8.
import math
import os
import subprocess

import pandas as pd
from pandasql import sqldf

from opendp.whitenoise.metadata import CollectionMetadata
from opendp.whitenoise.sql import PrivateReader, PandasReader
from opendp.whitenoise.sql.parse import QueryParser
from opendp.whitenoise.reader.rowset import TypedRowset

# Locate the PUMS test fixtures relative to the repository root.
git_root_dir = subprocess.check_output("git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()

meta_path = os.path.join(git_root_dir, os.path.join("service", "datasets", "PUMS.yaml"))
csv_path = os.path.join(git_root_dir, os.path.join("service", "datasets", "PUMS.csv"))

schema = CollectionMetadata.from_file(meta_path)
df = pd.read_csv(csv_path)

#
# Unit tests
#
class TestQuery:
    def test_count_exact(self):
        """A plain (non-private) COUNT over the 1000-row PUMS fixture."""
        reader = PandasReader(schema, df)
        rs = reader.execute("SELECT COUNT(*) AS c FROM PUMS.PUMS")
        assert(rs[1][0] == 1000)

    def test_empty_result(self):
        """A filter that matches nothing returns only the header row."""
        reader = PandasReader(schema, df)
        rs = reader.execute("SELECT age as a FROM PUMS.PUMS WHERE age > 100")
        assert(len(rs) == 1)

    def test_empty_result_typed(self):
        # NOTE(review): the body of this test appears truncated in this
        # chunk; only the reader construction is visible here.
        reader = PandasReader(schema, df)
# Collect every query file under the validate/ fixture directory.
validate_files = [
    join(testpath + "validate/", f)
    for f in listdir(testpath + "validate")
    if isfile(join(testpath + "validate", f))
]

# FIX: `not "_fail" in f` replaced with the idiomatic `"_fail" not in f`
# (PEP 8 / pycodestyle E713); behavior is identical.
# Files whose name contains "_fail" are expected to fail validation.
good_files = [f for f in validate_files if "_fail" not in f]
bad_files = [f for f in validate_files if "_fail" in f]

# Queries from the other fixture directories are all expected to validate.
for d in other_dirs:
    other_files = [
        join(testpath + d + "/", f)
        for f in listdir(testpath + d)
        if isfile(join(testpath + d, f))
    ]
    good_files.extend(other_files)

metadata = CollectionMetadata.from_file(join(dir_name, "Devices.yaml"))

#
# Unit tests
#
class TestValidate:
    def test_all_good_queries(self):
        """Every non-failing fixture query must pass validation."""
        for goodpath in good_files:
            print(goodpath)
            gqt = GoodQueryTester(goodpath)
            gqt.runValidate()

    def test_all_bad_queries(self):
        """Every *_fail fixture query must be rejected."""
        # NOTE(review): the loop body appears truncated in this chunk;
        # only the tester construction is visible here.
        for badpath in bad_files:
            bqt = BadQueryTester(badpath)
# Materialize the sklearn iris data as a CSV fixture if it is not present.
iris_dataset_path = os.path.join(root_url, "service", "datasets", "iris.csv")
if not os.path.exists(iris_dataset_path):
    bunch = sklearn.datasets.load_iris()
    frame = pd.DataFrame(data=bunch.data, columns=bunch.feature_names)
    frame.to_csv(iris_dataset_path)

# Write the matching metadata file alongside the CSV if it is not present.
iris_schema_path = os.path.join(root_url, "service", "datasets", "iris.yaml")
if not os.path.exists(iris_schema_path):
    float_columns = [
        Float("sepal length (cm)", 4, 8),
        Float("sepal width (cm)", 2, 5),
        Float("petal length (cm)", 1, 7),
        Float("petal width (cm)", 0, 3),
    ]
    iris = Table("iris", "iris", 150, float_columns)
    schema = CollectionMetadata([iris], "csv")
    schema.to_file(iris_schema_path, "iris")


def find_ngrams(input_list, n):
    """Return the n-grams of input_list as a list of tuples.

    For n == 1 the input list itself is returned unchanged.
    """
    if n == 1:
        return input_list
    shifted = [input_list[i:] for i in range(n)]
    return list(zip(*shifted))


def _download_file(url, local_file):
    """Download url to local_file, tolerating both urllib layouts."""
    try:
        from urllib import urlretrieve          # Python 2
    except ImportError:
        from urllib.request import urlretrieve  # Python 3
    urlretrieve(url, local_file)


# Fetch the PUMS-1000 evaluation dataset if it has not been downloaded yet.
pums_1000_dataset_path = os.path.join(root_url, "service", "datasets", "evaluation", "PUMS_1000.csv")
if not os.path.exists(pums_1000_dataset_path):
    pums_url = "https://raw.githubusercontent.com/opendifferentialprivacy/dp-test-datasets/master/data/PUMS_california_demographics_1000/data.csv"
import pandas as pd

from opendp.whitenoise.sql import PandasReader, PrivateReader
from opendp.whitenoise.metadata import CollectionMetadata

# Demo: compare an exact query result with its differentially private version.
pums = pd.read_csv('PUMS.csv')
meta = CollectionMetadata.from_file('PUMS.yaml')

# FIX: `query` was assigned twice in a row, so the first value was dead code
# and never executed. It is kept below, commented, as an alternative query
# that can be swapped in for experimentation.
# query = 'SELECT married, AVG(income) AS income, COUNT(*) AS n FROM PUMS.PUMS GROUP BY married'
query = 'SELECT COUNT(*) AS n, COUNT(pid) AS foo FROM PUMS.PUMS WHERE age > 80 GROUP BY educ'

reader = PandasReader(meta, pums)
# epsilon = 4.0; censor rare dimensions and clamp negative noisy counts.
private_reader = PrivateReader(meta, reader, 4.0)
private_reader.options.censor_dims = True
private_reader.options.clamp_counts = True

exact = reader.execute_typed(query)
print(exact)

private = private_reader.execute_typed(query)
print(private)
def _load_metadata(dataset_document):
    """Load the CollectionMetadata referenced by a dataset document.

    The path is taken from the document's dataverse details
    (local_metadata_path) and parsed via CollectionMetadata.from_file.
    """
    metadata_path = dataset_document.dataverse_details.local_metadata_path
    return CollectionMetadata.from_file(metadata_path)