Beispiel #1
0
 def __init__(self):
     from snsql.metadata import Metadata
     self.metadata = {
         'PUMS': Metadata.from_file(pums_schema_path),
         'PUMS_large': Metadata.from_file(pums_large_schema_path),
         'PUMS_pid': Metadata.from_file(pums_pid_schema_path),
         'PUMS_dup': Metadata.from_file(pums_dup_schema_path),
         'PUMS_null': Metadata.from_file(pums_dup_schema_path)
     }
     self.engines = {}
     home = os.path.expanduser("~")
     p = os.path.join(home, ".smartnoise", "connections-unit.yaml")
     if not os.environ.get('SKIP_PANDAS'):
         self.engines['pandas'] = DbEngine('pandas')
     else:
         print("Skipping pandas database tests")
     if os.environ.get('TEST_SPARK'):
         self.engines['spark'] = DbEngine('spark')
     else:
         print("TEST_SPARK not set, so skipping Spark tests")
     if not os.path.exists(p):
         print ("No config file at ~/.smartnoise/connections-unit.yaml")
     else:
         with open(p, 'r') as stream:
             conns = yaml.safe_load(stream)
         if conns is None:
             print("List of installed test engines is empty")
         else:
             for engine in conns:
                 eng = conns[engine]
                 host = conns[engine]["host"]
                 port = conns[engine]["port"]
                 user = conns[engine]["user"]
                 databases = eng['databases']
                 self.engines[engine] = DbEngine(engine, user, host, port, databases)
Beispiel #2
0
 def setup_class(cls):
     meta = Metadata.from_file(meta_path)
     meta["PUMS.PUMS"].censor_dims = False
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader, meta, privacy=Privacy(epsilon=10.0, delta=0.1))
     cls.reader = private_reader
Beispiel #3
0
    def __init__(
        self,
        reader,
        metadata,
        privacy=None
    ):
        """Create a new private reader.  Do not use the constructor directly;
            use the from_connection factory method.

            :param metadata: The Metadata object with information about all tables referenced in this query
            :param reader: The data reader to wrap, such as a SqlServerReader, PandasReader, or SparkReader
                The PrivateReader intercepts queries to the underlying reader and ensures differential privacy.
            :param epsilon_per_column: The privacy budget to spend for each column in the query (deprecated)
            :param delta: The delta privacy parameter (deprecated)
            :param privacy: Pass epsilon and delta
        """
        if isinstance(reader, Reader):
            self.reader = reader
        else:
            raise ValueError("Parameter reader must be of type Reader")
        self.metadata = Metadata.from_(metadata)
        self.rewriter = Rewriter(metadata)
        self._options = PrivateReaderOptions()

        if privacy:
            self.privacy = privacy
        else:
            raise ValueError("Must pass in a Privacy object with privacy parameters.")
        
        self.odometer = OdometerHeterogeneous(self.privacy)

        self._refresh_options()
Beispiel #4
0
    def __init__(self, query, metadata):
        self.query = query

        if metadata:
            self.metadata = Metadata.from_(metadata)
        else:
            self.metadata = metadata
def load_metadata():
    table_features = {
        "Store": {
            "name": "Store",
            "type": "int",
            "private_id": True
        },
        "Date": {
            "name": "Date",
            "type": "datetime"
        },
        "Temperature": {
            "name": "Temperature",
            "type": "float"
        },
        "Fuel_Price": {
            "name": "Fuel_Price",
            "type": "float"
        },
        "IsHoliday": {
            "name": "IsHoliday",
            "type": "boolean"
        },
        "row_privacy": False,
    }

    table_sales = {
        "Store": {
            "name": "Store",
            "type": "int",
            "private_id": True
        },
        "Date": {
            "name": "Date",
            "type": "datetime"
        },
        "Weekly_Sales": {
            "name": "Weekly_Sales",
            "type": "float"
        },
        "IsHoliday": {
            "name": "IsHoliday",
            "type": "boolean"
        },
        "row_privacy": False,
    }

    metadata_dict = {
        "": {
            "": {
                "features": table_features,
                "sales": table_sales,
            }
        },
    }
    return Metadata.from_dict(metadata_dict)
 def test_pandas(self):
     engine = "pandas"
     meta = Metadata.from_file(meta_path)
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     assert(isinstance(reader, SqlReader))
     assert(isinstance(reader, PandasReader))
     assert(isinstance(reader.compare, NameCompare))
     assert(isinstance(reader.compare, PandasNameCompare))
     assert(isinstance(reader.serializer, Serializer))
     assert(isinstance(reader.serializer, PandasSerializer))
Beispiel #7
0
 def test_with_censor_dims(self):
     meta = Metadata.from_file(meta_path)
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader,
                                    meta,
                                    privacy=Privacy(epsilon=3.0))
     query = "SELECT COUNT (*) AS foo, COUNT(DISTINCT pid) AS bar FROM PUMS.PUMS"
     q = QueryParser(meta).query(query)
     inner, outer = private_reader._rewrite_ast(q)
     ne = outer.select.namedExpressions
     assert (ne[0].expression.expression.name != 'keycount')
     assert (ne[1].expression.expression.name == 'keycount')
Beispiel #8
0
 def setup_class(self):
     meta = Metadata.from_file(meta_path)
     meta["PUMS.PUMS"].censor_dims = False
     meta["PUMS.PUMS"]["sex"].type = "int"
     meta["PUMS.PUMS"]["educ"].type = "int"
     meta["PUMS.PUMS"]["married"].type = "bool"
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader,
                                    meta,
                                    privacy=Privacy(epsilon=10.0,
                                                    delta=10e-3))
     self.reader = private_reader
Beispiel #9
0
    def queries(self, query_string, metadata=None):
        if metadata is None and self.metadata is not None:
            metadata = self.metadata
        elif metadata:
            metadata = Metadata.from_(metadata)

        istream = InputStream(query_string)
        parser = self.start_parser(istream)
        bv = BatchVisitor()
        queries = [q for q in bv.visit(parser.batch()).queries]
        if metadata is not None:
            for q in queries:
                q.load_symbols(metadata)
        return queries
Beispiel #10
0
 def test_case_sensitive(self):
     sample = Table(
         "PUMS", "PUMS",
         [Int('pid', is_key=True), Int('"PiD"')], 150)
     meta = Metadata([sample], "csv")
     reader = PostgresReader("localhost", "PUMS", "admin", "password")
     private_reader = PrivateReader(reader,
                                    meta,
                                    privacy=Privacy(epsilon=3.0))
     query = 'SELECT COUNT (DISTINCT pid) AS foo, COUNT(DISTINCT "PiD") AS bar FROM PUMS.PUMS'
     inner, outer = private_reader._rewrite(query)
     ne = outer.select.namedExpressions
     assert (ne[0].expression.expression.name == 'keycount')
     assert (ne[1].expression.expression.name != 'keycount')
Beispiel #11
0
 def test_reuse_expression(self):
     meta = Metadata.from_file(meta_path)
     df = pd.read_csv(csv_path)
     reader = PandasReader(df, meta)
     private_reader = PrivateReader(reader,
                                    meta,
                                    privacy=Privacy(epsilon=3.0))
     query = 'SELECT AVG(age), SUM(age), COUNT(age) FROM PUMS.PUMS'
     q = QueryParser(meta).query(query)
     inner, outer = private_reader._rewrite(query)
     names = unique(
         [f.name for f in outer.select.namedExpressions.find_nodes(Column)])
     assert (len(names) == 2)
     assert ('count_age' in names)
     assert ('sum_age' in names)
    def test_meta_from_string(self):
        meta_good = """col:
    ? ''
    : table:
        duration:
            type: float
        id:
            private_id: true
            type: int
        max_ids: 1
        row_privacy: false
        rows: 1000
        sample_max_ids: true
engine: pandas"""
        file = io.StringIO(meta_good)
        c = Metadata.from_file(file)
    def test_meta_bad_int(self):
        meta_bad_float = """col:
    ? ''
    : table:
        id:
            private_id: true
            type: int
        events:
            type: int
        max_ids: 1
        row_privacy: false
        rows: 1000
        sample_max_ids: true
engine: pandas"""
        file = io.StringIO(meta_bad_float)
        c = Metadata.from_file(file)
        assert (c["table"]["events"].unbounded)
Beispiel #14
0
 def get_private_readers(self, *ignore, metadata=None, privacy, database, engine=None, overrides={}, **kwargs):
     readers = []
     if metadata is None and database in self.metadata:
         metadata = self.metadata[database]
     if metadata is None:
         print(f"No metadata available for {database}")
         return []
     if isinstance(metadata, str):
         from snsql.metadata import Metadata
         metadata = Metadata.from_file(metadata)
     if len(overrides) > 0:
         # make a copy
         metadata = copy.deepcopy(metadata)
     # apply overrides to only the first table in the metadata
     table_name = list(metadata.m_tables)[0]
     table = metadata.m_tables[table_name]
     for propname in overrides:
         propval = overrides[propname]
         if propname == 'censor_dims':
             table.censor_dims = propval
         elif propname == 'clamp_counts':
             table.clamp_counts = propval
         elif propname == 'clamp_columns':
             table.clamp_columns = propval
         elif propname == 'max_ids' or propname == 'max_contrib':
             table.max_ids = propval
         else:
             print(f"Unable to set override for {propname}={propval}")
     if engine is not None:
         engines = [engine]
     else:
         engines = [eng for eng in self.engines]
     for engine in engines:
         if engine in self.engines:
             eng = self.engines[engine]
             reader = None
             try:
                 reader = eng.get_private_reader(metadata=metadata, privacy=privacy, database=database)
             except:
                 pass
             finally:
                 if reader:
                     readers.append(reader)
     return readers
Beispiel #15
0
 def test_load_from_path(self):
     meta = Metadata.from_(meta_path)
     p = meta['PUMS.PUMS']
Beispiel #16
0
privacy = Privacy(epsilon=3.0, delta=0.1)

metadata = Metadata.from_dict({
    "": {
        "PUMS": {
            "PUMS": {
                "censor_dims": True,
                "row_privacy": False,
                "uuid": {
                    "type": "int",
                    "private_id": True
                },
                "start_date": {
                    "type": "datetime"
                },
                "start_time": {
                    "type": "string"
                },
                "sales": {
                    "type": "float",
                    "upper": 10000,
                    "lower": 0.0
                }
            }
        }
    }
})

reader = from_connection(None,
                         engine="sqlserver",
Beispiel #17
0
import pytest
from snsql.metadata import Metadata
from snsql.sql.parse import QueryParser

from os import listdir
from os.path import isfile, join, dirname

dir_name = dirname(__file__)
testpath = join(dir_name, "queries") + "/"

metadata = Metadata.from_file(join(dir_name, "TestDB.yaml"))

other_dirs = [
    f for f in listdir(testpath)
    if not isfile(join(testpath, f)) and f != "parse"
]

parse_files = [
    join(testpath + "parse/", f) for f in listdir(testpath + "parse")
    if isfile(join(testpath + "parse", f))
]
good_files = [f for f in parse_files if not "_fail" in f]
bad_files = [f for f in parse_files if "_fail" in f]

for d in other_dirs:
    other_files = [
        join(testpath + d + "/", f) for f in listdir(testpath + d)
        if isfile(join(testpath + d, f))
    ]
    good_files.extend(other_files)
Beispiel #18
0
        "type": "boolean"
    },
    "row_privacy": False,
}

metadata_dict = {
    "engine": "sqlserver",
    "": {
        "": {
            "features": table_features,
            "sales": table_sales,
        }
    },
}

metadata = Metadata.from_dict(metadata_dict)


class TestAstFromMeta:
    def test_meta_load(self):
        assert (metadata["sales"]["Date"].nullable == False)
        assert (metadata["features"]["Date"].nullable == True)
        assert (metadata["features"]["IsHoliday"].nullable == False)
        assert (metadata["features"]["Store"].sensitivity == 150)
        assert (metadata["features"]["Temperature"].sensitivity == 75)

    def test_ast_attach_sens(self):
        query = 'SELECT SUM("Temperature"), SUM(features."Store") AS store FROM features'
        q = QueryParser(metadata).query(query)
        assert (q._select_symbols[0].expression.sensitivity() == 75)
        assert (q._named_symbols['store'].expression.sensitivity() == 150)
Beispiel #19
0
from snsql.sql.privacy import Privacy
from snsql.sql.private_reader import PrivateReader
import pandas as pd
import numpy as np

from snsql.metadata import Metadata

git_root_dir = subprocess.check_output(
    "git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()
meta_path = os.path.join(git_root_dir, os.path.join("datasets",
                                                    "PUMS_pid.yaml"))
csv_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.csv"))
pums = pd.read_csv(csv_path)
privacy = Privacy(epsilon=1.0)

meta_obj = Metadata.from_(meta_path)


class TestOdometer:
    def test_count_pid_query(self):
        priv = PrivateReader.from_connection(pums,
                                             privacy=privacy,
                                             metadata=meta_path)
        assert (priv.odometer.spent == (0.0, 0.0))
        assert (priv.odometer.k == 0)
        res = priv.execute(
            "SELECT COUNT(DISTINCT pid) FROM PUMS.PUMS GROUP BY educ")
        assert (priv.odometer.k == 1)

    def test_count_query(self):
        priv = PrivateReader.from_connection(pums,
Beispiel #20
0
from os.path import dirname, join

from snsql.metadata import Metadata
from snsql.sql.parse import QueryParser

dir_name = dirname(__file__)

metadata = Metadata.from_file(join(dir_name, "Devices.yaml"))


def qp(query_string):
    return QueryParser().query(query_string)


#
#   Unit tests
#
class TestTypes:
    def test_s12(self):
        q = qp("SELECT Refurbished FROM Telemetry.Crashes;")
        q.load_symbols(metadata)
        print(str(q["Refurbished"]))
        assert q["Refurbished"].expression.type() == "boolean"
        assert q["Refurbished"].expression.sensitivity() == 1

    def test_s13(self):
        q = qp("SELECT * FROM Telemetry.Crashes;")
        q.load_symbols(metadata)
        assert q["Refurbished"].expression.type() == "boolean"
        assert q["Refurbished"].expression.sensitivity() == 1
        assert q["Temperature"].expression.sensitivity() == 65.0
Beispiel #21
0
 def __init__(self, metadata=None):
     if metadata:
         self.metadata = Metadata.from_(metadata)
     else:
         self.metadata = None
Beispiel #22
0
import math

import pandas as pd

from snsql.metadata import Metadata
from snsql.sql import PrivateReader
from snsql.sql.privacy import Privacy
from snsql.sql.parse import QueryParser

git_root_dir = subprocess.check_output(
    "git rev-parse --show-toplevel".split(" ")).decode("utf-8").strip()
meta_path = os.path.join(git_root_dir, os.path.join("datasets",
                                                    "PUMS_pid.yaml"))
csv_path = os.path.join(git_root_dir, os.path.join("datasets", "PUMS_pid.csv"))

meta = Metadata.from_file(meta_path)
pums = pd.read_csv(csv_path)
query = 'SELECT AVG(age), STD(age), VAR(age), SUM(age), COUNT(age) FROM PUMS.PUMS GROUP BY sex'
q = QueryParser(meta).query(query)

privacy = Privacy(alphas=[0.01, 0.05], delta=1 / (math.sqrt(100) * 100))
priv = PrivateReader.from_connection(pums, privacy=privacy, metadata=meta)
subquery, root = priv._rewrite(query)

acc = Accuracy(root, subquery, privacy)


class TestAccuracy:
    def test_count_accuracy(self):
        error = acc.count(alpha=0.05)
        assert (error < 7.53978 and error > 0.5)
Beispiel #23
0
 def __init__(self, metadata, privacy=None):
     self.options = RewriterOptions()
     self.metadata = Metadata.from_(metadata)
     self.privacy = privacy