Example #1
from pynonymizer.strategy.exceptions import UnknownColumnStrategyError, UnknownTableStrategyError, ConfigSyntaxError
from pynonymizer.log import get_logger
from pynonymizer.strategy.table import UpdateColumnsTableStrategy, TruncateTableStrategy, DeleteTableStrategy, TableStrategyTypes
from pynonymizer.strategy.update_column import (
    UpdateColumnStrategyTypes, EmptyUpdateColumnStrategy,
    UniqueEmailUpdateColumnStrategy, UniqueLoginUpdateColumnStrategy,
    FakeUpdateColumnStrategy, LiteralUpdateColumnStrategy)
from pynonymizer.strategy.database import DatabaseStrategy
from copy import deepcopy

logger = get_logger(__name__)


class StrategyParser:
    def __init__(self, fake_seeder):
        self.fake_seeder = fake_seeder

    @staticmethod
    def __normalize_table_config(table_config):
        # If the table config doesn't have a "type" specified, it needs to be determined automatically.
        if "type" not in table_config:
            if "columns" in table_config:
                return {
                    **table_config,
                    "type": TableStrategyTypes.UPDATE_COLUMNS.value,
                }

            elif table_config == "truncate":
                return {
                    "type": TableStrategyTypes.TRUNCATE.value,
                }
Example #2
from time import sleep
from tqdm import tqdm

# The remaining names used below (log, execution, query_factory, resolve_input,
# resolve_output, DatabaseProvider, SEED_TABLE_NAME, TableStrategyTypes,
# UnsupportedTableStrategyError) come from the surrounding pynonymizer package;
# the excerpt omits those import lines.


class MySqlProvider(DatabaseProvider):
    """
    A command-line based MySQL provider. Uses the `mysql` and `mysqldump` binaries,
    because piping large amounts of SQL through the command-line client is efficient.
    Unfortunately, this implementation provides limited feedback when things go wrong.
    """
    __CHUNK_SIZE = 8192
    __DUMPSIZE_ESTIMATE_INFLATION = 1.15
    logger = log.get_logger(__name__)

    def __init__(self, db_host, db_user, db_pass, db_name, db_port=None, seed_rows=None, cmd_opts=None, dump_opts=None):
        if db_host is None:
            db_host = "127.0.0.1"
        if db_port is None:
            db_port = "3306"
        if cmd_opts is None:
            cmd_opts = ""
        if dump_opts is None:
            dump_opts = ""

        super().__init__(db_host=db_host, db_user=db_user, db_pass=db_pass, db_name=db_name, db_port=db_port, seed_rows=seed_rows)
        self.__runner = execution.MySqlCmdRunner(db_host, db_user, db_pass, db_name, db_port, additional_opts=cmd_opts)
        self.__dumper = execution.MySqlDumpRunner(db_host, db_user, db_pass, db_name, db_port, additional_opts=dump_opts)

    def __seed(self, qualifier_map):
        """
        'Seed' the database with a set of pre-generated random records so that
        column updates can be performed in batches
        """
        for i in tqdm(range(0, self.seed_rows), desc="Inserting seed data", unit="rows"):
            self.logger.debug(f"Inserting seed row {i}")
            self.__runner.db_execute(query_factory.get_insert_seed_row(SEED_TABLE_NAME, qualifier_map))

    def __estimate_dumpsize(self):
        """
        Makes a guess on the dump size using internal database metrics
        :return: A value in bytes, or None (unknown)
        """
        statement = query_factory.get_dumpsize_estimate(self.db_name)
        process_output = self.__runner.get_single_result(statement)

        try:
            return int(process_output) * self.__DUMPSIZE_ESTIMATE_INFLATION
        except ValueError:
            # Value unparsable, likely NULL
            return None

    def __read_until_empty_byte(self, data):
        return iter(lambda: data.read(self.__CHUNK_SIZE), b'')

    def __run_scripts(self, script_list, title=""):
        for i, script in enumerate(script_list):
            self.logger.info(f"Running {title} script #{i} \"{script[:50]}\"")
            self.logger.info(self.__runner.db_execute(script))

    def create_database(self):
        """Create the working database"""
        self.__runner.execute(query_factory.get_create_database(self.db_name))

    def drop_database(self):
        """Drop the working database"""
        self.__runner.execute(query_factory.get_drop_database(self.db_name))

    def anonymize_database(self, database_strategy):
        """
        Anonymize a restored database using the passed database strategy
        :param database_strategy: a strategy.DatabaseStrategy configuration
        :return:
        """
        qualifier_map = database_strategy.fake_update_qualifier_map

        if len(qualifier_map) > 0:
            self.logger.info("creating seed table with %d columns", len(qualifier_map))
            create_seed_table_sql = query_factory.get_create_seed_table(SEED_TABLE_NAME, qualifier_map)
            self.__runner.db_execute(create_seed_table_sql)

            self.logger.info("Inserting seed data")
            self.__seed(qualifier_map)

        self.__run_scripts(database_strategy.before_scripts, "before")

        table_strategies = database_strategy.table_strategies
        self.logger.info("Anonymizing %d tables", len(table_strategies))

        with tqdm(desc="Anonymizing database", total=len(table_strategies)) as progressbar:
            for table_strategy in table_strategies:
                if table_strategy.schema is not None:
                    self.logger.warning(
                        "%s: MySQL provider does not support table schema. This option will be ignored.",
                        table_strategy.table_name
                    )

                if table_strategy.strategy_type == TableStrategyTypes.TRUNCATE:
                    progressbar.set_description("Truncating {}".format(table_strategy.table_name))
                    self.__runner.db_execute(query_factory.get_truncate_table(table_strategy.table_name))
                
                elif table_strategy.strategy_type == TableStrategyTypes.DELETE:
                    progressbar.set_description("Deleting {}".format(table_strategy.table_name))
                    self.__runner.db_execute(query_factory.get_delete_table(table_strategy.table_name))

                elif table_strategy.strategy_type == TableStrategyTypes.UPDATE_COLUMNS:
                    progressbar.set_description("Anonymizing {}".format(table_strategy.table_name))
                    statements = query_factory.get_update_table(SEED_TABLE_NAME, table_strategy)
                    self.__runner.db_execute(statements)

                else:
                    raise UnsupportedTableStrategyError(table_strategy)

                progressbar.update()

        self.__run_scripts(database_strategy.after_scripts, "after")

        self.logger.info("dropping seed table")
        self.__runner.db_execute(query_factory.get_drop_seed_table(SEED_TABLE_NAME))

        # Wait an arbitrary amount of time here to prevent this step from interacting with 
        # transactional dump operations
        self.logger.debug("Waiting for trailing operations to complete...")
        sleep(0.2)

    def restore_database(self, input_path):
        """
        Feed a mysqldump dumpfile to the mysql binary on stdin.
        :param input_path:
        :return:
        """
        input_obj = resolve_input(input_path)
        dumpsize = input_obj.get_size()

        batch_processor = self.__runner.open_batch_processor()
        with input_obj.open() as dumpfile_data:
            with tqdm(desc="Restoring", total=dumpsize, unit='B', unit_scale=True, unit_divisor=1000) as bar:
                for chunk in self.__read_until_empty_byte(dumpfile_data):
                    batch_processor.write(chunk)
                    batch_processor.flush()
                    bar.update(len(chunk))

    def dump_database(self, output_path):
        """
        Feed an output with stdout from the mysqldump binary
        :param output_path:
        :return:
        """
        output_obj = resolve_output(output_path)
        dumpsize_estimate = self.__estimate_dumpsize()

        dump_process = self.__dumper.open_dumper()
        with output_obj.open() as output_file:
            with tqdm(desc="Dumping", total=dumpsize_estimate, unit='B', unit_scale=True, unit_divisor=1000) as bar:
                for chunk in self.__read_until_empty_byte(dump_process):
                    output_file.write(chunk)
                    bar.update(len(chunk))
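
A minimal usage sketch for the provider above. The credentials, file paths, and the database_strategy object are hypothetical; in pynonymizer itself the strategy would come from something like Example #1's StrategyParser.

provider = MySqlProvider(
    db_host="127.0.0.1",
    db_user="root",
    db_pass="s3cret",
    db_name="anonymize_working_db",
    seed_rows=150,
)

provider.create_database()
provider.restore_database("./prod_dump.sql")      # piped into `mysql` in 8 KiB chunks
provider.anonymize_database(database_strategy)    # a strategy.DatabaseStrategy instance
provider.dump_database("./anonymized_dump.sql")   # streamed out of `mysqldump`
provider.drop_database()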
Example #3
import math
from pathlib import PurePosixPath, PureWindowsPath

from tqdm import tqdm

# The remaining names used below (get_logger, DatabaseProvider, DependencyError,
# SEED_TABLE_NAME, _FAKE_COLUMN_TYPES, TableStrategyTypes, UpdateColumnStrategyTypes,
# UnsupportedColumnStrategyError, UnsupportedTableStrategyError) come from the
# surrounding pynonymizer package; the excerpt omits those import lines.


class MsSqlProvider(DatabaseProvider):
    """
    A pyodbc-based MSSQL provider.
    """

    logger = get_logger(__name__)

    # STATS value for the RESTORE/BACKUP commands: report progress every X percent.
    # A lower value means MORE result sets / more frequent updates from the backup command.
    # Values lower than 5 often yield unreliable results on smaller backups.
    __STATS = 5

    def __init__(self,
                 db_host,
                 db_user,
                 db_pass,
                 db_name,
                 db_port=None,
                 seed_rows=None,
                 backup_compression=False):
        # import pyodbc here so a missing dependency fails fast
        import pyodbc

        if db_host is not None:
            raise DependencyError(
                "db_host",
                "MsSqlProvider does not support remote servers due to backup file "
                "location requirements. You must omit db_host from your configuration "
                "and run pynonymizer on the same server as the database.")

        # TODO: The odbc port syntax doesn't seem to work with (local),1433 or port=1433
        # TODO: This needs attention, but as it's backwards compatible, we're going to disallow it here.
        if db_port is not None:
            raise DependencyError(
                "db_port",
                "MsSqlProvider does not support custom ports. You must omit db_port "
                "from your configuration to continue.")

        db_host = "(local)"

        super().__init__(db_host=db_host,
                         db_user=db_user,
                         db_pass=db_pass,
                         db_name=db_name,
                         db_port=db_port,
                         seed_rows=seed_rows)
        self.__conn = None
        self.__db_conn = None
        self.__backup_compression = backup_compression

    def __connection(self):
        """a lazy-evaluated connection"""
        import pyodbc
        if self.__conn is None:
            self.__conn = pyodbc.connect(
                driver="{SQL Server Native Client 11.0}",
                server=self.db_host,
                uid=self.db_user,
                pwd=self.db_pass,
                autocommit=True)

        return self.__conn

    def __db_connection(self):
        """a lazy-evaluated db-specific connection"""
        import pyodbc
        if self.__db_conn is None:
            self.__db_conn = pyodbc.connect(
                driver="{SQL Server Native Client 11.0}",
                database=self.db_name,
                server=self.db_host,
                uid=self.db_user,
                pwd=self.db_pass,
                autocommit=True)

        return self.__db_conn

    def __execute(self, *args, **kwargs):
        return self.__connection().execute(*args, **kwargs)

    def __db_execute(self, *args, **kwargs):
        return self.__db_connection().execute(*args, **kwargs)

    def __get_path(self, filepath):
        if "\\" in filepath:
            return PureWindowsPath(filepath)
        else:
            return PurePosixPath(filepath)

    def __get_default_datafolder(self):
        """
        Locate the default data folder using the `model` database location.
        It's possible that the model database is not the current default, i.e. if it's been changed after install.
        The solution to this would be to create a new database and then perform the check below on that instead.
        See https://blogs.technet.microsoft.com/sqlman/2009/07/19/tsql-script-determining-default-database-file-log-path/

        However, this seems like a heavyweight solution for what is essentially a tsql-writeable temp folder, so
        checking the model db seems like a good 'boring' solution.
        :return: Default data directory e.g. "C:\\DATA"
        """
        datafile = self.__execute("""
        SELECT physical_name
        FROM sys.master_files mf
        INNER JOIN sys.[databases] d
        ON mf.[database_id] = d.[database_id]
        WHERE d.[name] = 'model' AND type = 0
        """).fetchone()[0]

        return self.__get_path(datafile).parent

    def __get_default_logfolder(self):
        """
        Locate the default log folder using the `model` database location
        __get_default_datafolder: see for more info
        :return:
        """
        logfile = self.__execute("""
        SELECT physical_name
        FROM sys.master_files mf
        INNER JOIN sys.[databases] d
        ON mf.[database_id] = d.[database_id]
        WHERE d.[name] = 'model' AND type = 1
        """).fetchone()[0]

        return self.__get_path(logfile).parent

    def __get_file_moves(self, input_path):
        """
        Using RESTORE FILELISTONLY, get all the files in the backup that need to be moved to the local system for restore
        :return: a dict of file name: new destination
        """
        datadir = self.__get_default_datafolder()
        logdir = self.__get_default_logfolder()

        filelist = self.__execute("RESTORE FILELISTONLY FROM DISK = ?;",
                                  input_path).fetchall()

        move_file_map = {}
        for file in filelist:
            name = file[0]
            file_type = file[2].upper()  # renamed to avoid shadowing the built-in `type`
            filepath = self.__get_path(file[1])

            # log files can go into the default log directory; everything else goes into the data directory
            if file_type == "L":
                target_path = str(
                    logdir.joinpath(f"{self.db_name}_{filepath.name}"))
            else:
                target_path = str(
                    datadir.joinpath(f"{self.db_name}_{filepath.name}"))

            move_file_map[name] = target_path

        return move_file_map

    def __async_operation_progress(self, desc, cursor):
        # With STATS=x, we should receive 100/x result sets, provided the backup is slow enough.
        # With some databases, it will jump from y% to 100, so we'll only get <x nextset calls.
        # Even SSMS doesn't get informed (read: it's not my fault, blame Microsoft)
        with tqdm(desc=desc,
                  total=math.floor(100 / self.__STATS)) as progressbar:
            while cursor.nextset():
                progressbar.update()

            # finish the progress - less confusing than a dangling 40% progressbar
            progressbar.update(progressbar.total - progressbar.n)

    def __run_scripts(self, script_list, title=""):
        import pyodbc
        for i, script in enumerate(script_list):
            self.logger.info(f"Running f{title} script #{i} \"{script[:50]}\"")
            cursor = self.__db_execute(script)
            results = None
            try:
                results = cursor.fetchall()
            except pyodbc.Error:
                pass
            self.logger.info(results)

    def __create_seed_table(self, qualifier_map):
        seed_column_lines = [
            "[{}] {}".format(name, _FAKE_COLUMN_TYPES[col.data_type])
            for name, col in qualifier_map.items()
        ]
        create_statement = "CREATE TABLE [{}]({});".format(
            SEED_TABLE_NAME, ",".join(seed_column_lines))

        self.__db_execute(create_statement)

    def __drop_seed_table(self):
        self.__db_execute("DROP TABLE IF EXISTS [{}];".format(SEED_TABLE_NAME))

    def __insert_seed_row(self, qualifier_map):
        column_list = ",".join(
            ["[{}]".format(qualifier) for qualifier in qualifier_map])
        substitution_list = ",".join(["?"] * len(qualifier_map))
        value_list = [
            column.value for qualifier, column in qualifier_map.items()
        ]

        statement = "INSERT INTO [{}]({}) VALUES ({});".format(
            SEED_TABLE_NAME, column_list, substitution_list)
        self.__db_execute(statement, value_list)

    def __seed(self, qualifier_map):
        for i in tqdm(range(0, self.seed_rows),
                      desc="Inserting seed data",
                      unit="rows"):
            self.__insert_seed_row(qualifier_map)

    def __get_column_subquery(self, column_strategy):
        if column_strategy.strategy_type == UpdateColumnStrategyTypes.EMPTY:
            return "('')"
        elif column_strategy.strategy_type == UpdateColumnStrategyTypes.UNIQUE_EMAIL:
            return "( SELECT CONCAT(NEWID(), '@', NEWID(), '.com') )"
        elif column_strategy.strategy_type == UpdateColumnStrategyTypes.UNIQUE_LOGIN:
            return "( SELECT NEWID() )"
        elif column_strategy.strategy_type == UpdateColumnStrategyTypes.FAKE_UPDATE:
            return f"( SELECT TOP 1 [{column_strategy.qualifier}] FROM [{SEED_TABLE_NAME}] ORDER BY NEWID())"
        elif column_strategy.strategy_type == UpdateColumnStrategyTypes.LITERAL:
            return column_strategy.value
        else:
            raise UnsupportedColumnStrategyError(column_strategy)

    def create_database(self):
        self.logger.warning(
            "MSSQL: create_database ignored, database will be created when restore_database is run"
        )

    def drop_database(self):
        # force connection close so we can always drop the db: sometimes timing makes a normal drop impossible.
        self.__execute(
            f"ALTER DATABASE [{self.db_name}] SET SINGLE_USER WITH ROLLBACK IMMEDIATE"
        )
        self.__execute(f"DROP DATABASE IF EXISTS [{self.db_name}];")

    def anonymize_database(self, database_strategy):
        qualifier_map = database_strategy.fake_update_qualifier_map

        if len(qualifier_map) > 0:
            self.logger.info("creating seed table with %d columns",
                             len(qualifier_map))
            self.__create_seed_table(qualifier_map)

            self.logger.info("Inserting seed data")
            self.__seed(qualifier_map)

        self.__run_scripts(database_strategy.before_scripts, "before")

        table_strategies = database_strategy.table_strategies
        self.logger.info("Anonymizing %d tables", len(table_strategies))

        with tqdm(desc="Anonymizing database",
                  total=len(table_strategies)) as progressbar:
            for table_strategy in table_strategies:
                table_name = table_strategy.table_name
                schema_prefix = f"[{table_strategy.schema}]." if table_strategy.schema else ""

                if table_strategy.strategy_type == TableStrategyTypes.TRUNCATE:
                    progressbar.set_description(
                        "Truncating {}".format(table_name))
                    self.__db_execute("TRUNCATE TABLE {}[{}];".format(
                        schema_prefix, table_name))

                elif table_strategy.strategy_type == TableStrategyTypes.DELETE:
                    progressbar.set_description(
                        "Deleting {}".format(table_name))
                    self.__db_execute("DELETE FROM {}[{}];".format(
                        schema_prefix, table_name))

                elif table_strategy.strategy_type == TableStrategyTypes.UPDATE_COLUMNS:
                    progressbar.set_description(
                        "Anonymizing {}".format(table_name))
                    where_grouping = table_strategy.group_by_where()
                    total_wheres = len(where_grouping)

                    for i, (where,
                            column_map) in enumerate(where_grouping.items()):
                        column_assignments = ",".join([
                            "[{}] = {}".format(
                                name, self.__get_column_subquery(column))
                            for name, column in column_map.items()
                        ])
                        where_clause = f" WHERE {where}" if where else ""
                        progressbar.set_description(
                            "Anonymizing {}: w[{}/{}]".format(
                                table_name, i + 1, total_wheres))
                        self.__db_execute("UPDATE {}[{}] SET {}{};".format(
                            schema_prefix, table_name, column_assignments,
                            where_clause))

                else:
                    raise UnsupportedTableStrategyError(table_strategy)

                progressbar.update()

        self.__run_scripts(database_strategy.after_scripts, "after")

        self.logger.info("Dropping seed table")
        self.__drop_seed_table()

    def restore_database(self, input_path):
        move_files = self.__get_file_moves(input_path)

        self.logger.info("Found %d files in %s", len(move_files), input_path)
        self.logger.debug(move_files)

        # get move statements and flatten pairs out so we can do the 2-param substitution
        move_clauses = ", ".join(["MOVE ? TO ?"] * len(move_files))
        move_clause_params = [
            item for pair in move_files.items() for item in pair
        ]

        restore_cursor = self.__execute(
            f"RESTORE DATABASE ? FROM DISK = ? WITH {move_clauses}, STATS = ?;",
            [self.db_name, input_path, *move_clause_params, self.__STATS])

        self.__async_operation_progress("Restoring Database", restore_cursor)

    def dump_database(self, output_path):
        with_options = []
        if self.__backup_compression:
            with_options.append("COMPRESSION")

        with_options_str = ",".join(with_options) + ", " if with_options else ""

        dump_cursor = self.__execute(
            f"BACKUP DATABASE ? TO DISK = ? WITH {with_options_str}STATS = ?;",
            [self.db_name, output_path, self.__STATS])
        self.__async_operation_progress("Dumping Database", dump_cursor)
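
A matching usage sketch for the MSSQL provider. Unlike the MySQL provider, db_host and db_port must be omitted (passing either raises DependencyError), create_database is a no-op, and restore_database both creates the database and relocates the backup's data/log files. Paths and credentials here are hypothetical.

provider = MsSqlProvider(
    db_host=None,           # must run on the server itself; connects to "(local)"
    db_user="sa",
    db_pass="s3cret",
    db_name="anonymize_working_db",
    seed_rows=150,
    backup_compression=True,
)

provider.restore_database(r"C:\backups\prod.bak")    # creates the working database
provider.anonymize_database(database_strategy)       # a strategy.DatabaseStrategy instance
provider.dump_database(r"C:\backups\prod_anon.bak")
provider.drop_database()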
Example #4
def test_no_logger_name_str(get_logger_mock):
    with pytest.raises(ValueError):
        logger = get_logger("")
Example #5
def test_no_logger_name_array(get_logger_mock):
    with pytest.raises(ValueError):
        logger = get_logger([])
Example #6
def test_multipart_name(get_logger_mock):
    logger = get_logger(["a", "b", "cd", "ef", "ghi"])
    get_logger_mock.assert_called_once_with("pynonymizer.a.b.cd.ef.ghi")
Example #7
def test_simple_name(get_logger_mock):
    logger = get_logger("test")
    get_logger_mock.assert_called_once_with("pynonymizer.test")
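
Examples #4-#7 all rely on a get_logger_mock fixture that the excerpts omit. Below is a minimal sketch of what such a fixture could look like, assuming pynonymizer.log.get_logger ultimately delegates to logging.getLogger (the patch target is an assumption):

import pytest
from unittest.mock import patch

from pynonymizer.log import get_logger


@pytest.fixture
def get_logger_mock():
    # patch the stdlib logger factory so tests can assert on the composed name
    with patch("logging.getLogger") as mock:
        yield mock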