Example #1
    def subset_downstream(self, table, relationships):
        referencing_tables = self.__db_helper.get_redacted_table_references(
            table, self.__all_tables, self.__source_conn)

        if not referencing_tables:
            return
        pk_columns = referencing_tables[0]['target_columns']

        temp_table = self.__db_helper.create_id_temp_table(
            self.__destination_conn, len(pk_columns))

        for r in referencing_tables:
            fk_table = r['fk_table']
            fk_columns = r['fk_columns']

            q = 'SELECT {} FROM {} WHERE {} NOT IN (SELECT {} FROM {})'.format(
                columns_joined(fk_columns),
                fully_qualified_table(
                    mysql_db_name_hack(fk_table, self.__destination_conn)),
                columns_tupled(fk_columns), columns_joined(pk_columns),
                fully_qualified_table(
                    mysql_db_name_hack(table, self.__destination_conn)))
            self.__db_helper.copy_rows(self.__destination_conn,
                                       self.__destination_write_conn, q,
                                       temp_table)

        columns_query = columns_to_copy(table, relationships,
                                        self.__source_conn)

        cursor_name = 'table_cursor_' + str(uuid.uuid4()).replace('-', '')
        cursor = self.__destination_conn.cursor(name=cursor_name,
                                                withhold=True)
        cursor_query = 'SELECT DISTINCT * FROM {}'.format(
            fully_qualified_table(temp_table))
        cursor.execute(cursor_query)
        fetch_row_count = 100000
        while True:
            rows = cursor.fetchmany(fetch_row_count)
            if len(rows) == 0:
                break

            ids = [
                '(' + ','.join(['\'' + str(c) + '\'' for c in row]) + ')'
                for row in rows if all([c is not None for c in row])
            ]

            if len(ids) == 0:
                # every row in this batch contained a NULL; skip the batch
                # rather than break, which would drop all remaining batches
                continue

            ids_to_query = ','.join(ids)
            q = 'SELECT {} FROM {} WHERE {} IN ({})'.format(
                columns_query, fully_qualified_table(table),
                columns_tupled(pk_columns), ids_to_query)
            self.__db_helper.copy_rows(
                self.__source_conn, self.__destination_conn, q,
                mysql_db_name_hack(table, self.__destination_conn))

        cursor.close()
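
The queries above are assembled with a few small formatting helpers (quoter, columns_joined, columns_tupled, fully_qualified_table) that this page does not show. A minimal sketch of plausible implementations, assuming ANSI double-quote identifier quoting (the real project presumably switches quoting style by database type):

def quoter(identifier):
    # Assumption: ANSI-style double-quote quoting; MySQL would use backticks.
    return '"' + identifier + '"'


def columns_joined(columns):
    # ['a', 'b'] -> '"a","b"'
    return ','.join(quoter(c) for c in columns)


def columns_tupled(columns):
    # ['a', 'b'] -> '("a","b")', for tuple comparisons such as (a,b) IN (...)
    return '(' + columns_joined(columns) + ')'


def fully_qualified_table(table):
    # 'schema.table' -> '"schema"."table"'
    return '.'.join(quoter(part) for part in table.split('.'))
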
def copy_rows(source, destination, query, destination_table):
    cursor = source.cursor()

    try:
        cursor.execute(query)
        fetch_row_count = 1000
        while True:
            rows = cursor.fetchmany(fetch_row_count)
            if len(rows) == 0:
                break

            template = ','.join(['%s'] * len(rows[0]))
            destination_cursor = destination.cursor()
            insert_query = 'INSERT INTO {} VALUES ({})'.format(
                fully_qualified_table(destination_table), template)
            destination_cursor.executemany(insert_query, rows)

            destination_cursor.close()
            destination.commit()

            if len(rows) < fetch_row_count:
                # necessary because MySQL misbehaves if you call fetchmany
                # again after the last row has been returned
                break
    except Exception as e:
        if hasattr(e, 'msg') and e.msg.startswith('Table') and e.msg.endswith(
                'doesn\'t exist'):
            raise ValueError(
                'Your database has foreign keys to another database. This is not currently supported.'
            )
        else:
            raise
    finally:
        cursor.close()
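
For context, a hypothetical call site for this MySQL-flavored copy_rows (the connection names and tables are illustrative, not from the project):

# source_conn and dest_conn are assumed to be open DB-API connections.
q = 'SELECT id, email FROM users WHERE active = 1'
copy_rows(source_conn, dest_conn, q, 'subset_db.users')
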
def clean_temp_table_cells(fk_table, fk_columns, target_table, target_columns,
                           conn):
    fk_alias = 'tonic_subset_398dhjr23_fk'
    target_alias = 'tonic_subset_398dhjr23_target'

    fk_table = fully_qualified_table(source_db_temp_table(fk_table))
    target_table = fully_qualified_table(source_db_temp_table(target_table))
    assignment_list = ','.join(
        ['{} = NULL'.format(quoter(c)) for c in fk_columns])
    column_matching = ' AND '.join([
        '{}.{} = {}.{}'.format(fk_alias, quoter(fc), target_alias, quoter(tc))
        for fc, tc in zip(fk_columns, target_columns)
    ])
    q = 'UPDATE {} {} SET {} WHERE NOT EXISTS (SELECT 1 FROM {} {} WHERE {})'.format(
        fk_table, fk_alias, assignment_list, target_table, target_alias,
        column_matching)
    run_query(q, conn)
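
To make the intent concrete, here is a hypothetical invocation and roughly the UPDATE it builds (table and column names are illustrative):

# NULL out orders.user_id values whose referenced users row did not survive
# the subset; conn is an assumed open connection.
clean_temp_table_cells('orders', ['user_id'], 'users', ['id'], conn)
# Emits approximately:
#   UPDATE <temp orders> tonic_subset_398dhjr23_fk
#   SET "user_id" = NULL
#   WHERE NOT EXISTS (SELECT 1 FROM <temp users> tonic_subset_398dhjr23_target
#                     WHERE tonic_subset_398dhjr23_fk."user_id" =
#                           tonic_subset_398dhjr23_target."id")
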
def create_id_temp_table(conn, number_of_columns):
    temp_table = temp_db + '.' + str(uuid.uuid4())
    cursor = conn.cursor()
    column_defs = ',\n'.join(
        ['    col' + str(aye) + '  text' for aye in range(number_of_columns)])
    q = 'CREATE TABLE {} (\n {} \n)'.format(fully_qualified_table(temp_table),
                                            column_defs)
    cursor.execute(q)
    cursor.close()
    return temp_table
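
For example, a call with two key columns produces DDL along these lines (the table name is a fresh UUID inside temp_db, so it will differ every run):

# conn is an assumed open connection; number_of_columns comes from the
# primary key being staged.
temp_table = create_id_temp_table(conn, number_of_columns=2)
# CREATE TABLE "temp_db"."<uuid>" (
#     col0  text,
#     col1  text
# )
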
def copy_to_temp_table(conn, query, target_table, pk_columns=None):
    temp_table = fully_qualified_table(source_db_temp_table(target_table))
    with conn.cursor() as cur:
        cur.execute('CREATE TEMPORARY TABLE IF NOT EXISTS ' + temp_table +
                    ' AS ' + query + ' LIMIT 0')
        if pk_columns:
            query = query + ' WHERE {} NOT IN (SELECT {} FROM {})'.format(
                columns_tupled(pk_columns), columns_joined(pk_columns),
                temp_table)
        cur.execute('INSERT INTO ' + temp_table + ' ' + query)
        conn.commit()
Example #6
    def __subset_direct(self, target, relationships):
        t = target['table']
        columns_query = columns_to_copy(t, relationships, self.__source_conn)
        if 'where' in target:
            q = 'SELECT {} FROM {} WHERE {}'.format(columns_query,
                                                    fully_qualified_table(t),
                                                    target['where'])
        elif 'percent' in target:
            if config_reader.get_db_type() == 'postgres':
                q = 'SELECT {} FROM {} WHERE random() < {}'.format(
                    columns_query, fully_qualified_table(t),
                    float(target['percent']) / 100)
            else:
                q = 'SELECT {} FROM {} WHERE rand() < {}'.format(
                    columns_query, fully_qualified_table(t),
                    float(target['percent']) / 100)
        else:
            raise ValueError(
                'target table {} had no \'where\' or \'percent\' term defined, check your configuration.'
                .format(t))
        self.__db_helper.copy_rows(
            self.__source_conn, self.__destination_conn, q,
            mysql_db_name_hack(t, self.__destination_conn))
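
The method reads 'table' plus either a 'where' or a 'percent' key from each target. Hypothetical target entries that would exercise both branches (values are illustrative):

targets = [
    {'table': 'public.users', 'where': "created_at > '2020-01-01'"},
    {'table': 'public.events', 'percent': 5},  # keep a random ~5% sample
]
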
from psycopg2.extras import execute_values  # needed for the batch insert below
import uuid


def copy_rows(source, destination, query, destination_table):
    datatypes = get_table_datatypes(table_name(destination_table),
                                    schema_name(destination_table),
                                    destination)

    def template_piece(dt):
        if dt == '_json':
            return '%s::json[]'
        elif dt == '_jsonb':
            return '%s::jsonb[]'
        else:
            return '%s'

    template = '(' + ','.join([template_piece(dt) for dt in datatypes]) + ')'

    cursor_name = 'table_cursor_' + str(uuid.uuid4()).replace('-', '')
    cursor = source.cursor(name=cursor_name)
    cursor.execute(query)

    fetch_row_count = 100000
    while True:
        rows = cursor.fetchmany(fetch_row_count)
        if len(rows) == 0:
            break

        # We end up executing many INSERT statements while copying data;
        # using the inner_cursor avoids logging all of that noise.
        destination_cursor = destination.cursor().inner_cursor

        insert_query = 'INSERT INTO {} VALUES %s'.format(
            fully_qualified_table(destination_table))

        execute_values(destination_cursor, insert_query, rows, template)

        destination_cursor.close()

    cursor.close()
    destination.commit()
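
execute_values here is psycopg2's batch helper (psycopg2.extras.execute_values): it expands the single VALUES %s placeholder into one multi-row INSERT, using the per-column template built above to cast json/jsonb array columns. A minimal standalone illustration (the table and rows are made up):

from psycopg2.extras import execute_values

# cursor is an assumed open psycopg2 cursor; the template applies a cast to
# the second column, mirroring the '_json'/'_jsonb' handling above.
rows = [(1, '{"a": 1}'), (2, '{"b": 2}')]
execute_values(cursor, 'INSERT INTO t VALUES %s', rows,
               template='(%s, %s::jsonb)')
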
Example #8
    def run_middle_out(self):
        passthrough_tables = self.__get_passthrough_tables()
        relationships = self.__db_helper.get_unredacted_fk_relationships(
            self.__all_tables, self.__source_conn)
        disconnected_tables = compute_disconnected_tables(
            config_reader.get_initial_target_tables(), passthrough_tables,
            self.__all_tables, relationships)
        connected_tables = [
            table for table in self.__all_tables
            if table not in disconnected_tables
        ]
        order = get_topological_order_by_tables(relationships,
                                                connected_tables)
        order = list(order)

        # start by subsetting the direct targets
        print('Beginning subsetting with these direct targets: ' +
              str(config_reader.get_initial_target_tables()))
        start_time = time.time()
        processed_tables = set()
        for idx, target in enumerate(config_reader.get_initial_targets()):
            print_progress(target, idx + 1,
                           len(config_reader.get_initial_targets()))
            self.__subset_direct(target, relationships)
            processed_tables.add(target['table'])
        print('Direct target tables completed in {}s'.format(time.time() -
                                                             start_time))

        # greedily grab rows with foreign keys to rows in the target strata
        upstream_tables = compute_upstream_tables(
            config_reader.get_initial_target_tables(), order)
        print('Beginning greedy upstream subsetting with these tables: ' +
              str(upstream_tables))
        start_time = time.time()
        for idx, t in enumerate(upstream_tables):
            print_progress(t, idx + 1, len(upstream_tables))
            data_added = self.__subset_upstream(t, processed_tables,
                                                relationships)
            if data_added:
                processed_tables.add(t)
        print('Greedy subsetting completed in {}s'.format(time.time() -
                                                          start_time))

        # Process pass-through tables. This must happen before subset_downstream
        # so that all required downstream rows can be found.
        print('Beginning pass-through tables: ' + str(passthrough_tables))
        start_time = time.time()
        for idx, t in enumerate(passthrough_tables):
            print_progress(t, idx + 1, len(passthrough_tables))
            q = 'SELECT * FROM {}'.format(fully_qualified_table(t))
            self.__db_helper.copy_rows(
                self.__source_conn, self.__destination_conn, q,
                mysql_db_name_hack(t, self.__destination_conn))
        print('Pass-through completed in {}s'.format(time.time() - start_time))

        # use subset_downstream to get all supporting rows according to existing needs
        downstream_tables = compute_downstream_tables(passthrough_tables,
                                                      disconnected_tables,
                                                      order)
        print('Beginning downstream subsetting with these tables: ' +
              str(downstream_tables))
        start_time = time.time()
        for idx, t in enumerate(downstream_tables):
            print_progress(t, idx + 1, len(downstream_tables))
            self.subset_downstream(t, relationships)
        print('Downstream subsetting completed in {}s'.format(time.time() -
                                                              start_time))

        if config_reader.keep_disconnected_tables():
            # get all the data for tables in disconnected components (i.e. pass those tables through)
            print('Beginning disconnected tables: ' + str(disconnected_tables))
            start_time = time.time()
            for idx, t in enumerate(disconnected_tables):
                print_progress(t, idx + 1, len(disconnected_tables))
                q = 'SELECT * FROM {}'.format(fully_qualified_table(t))
                self.__db_helper.copy_rows(
                    self.__source_conn, self.__destination_conn, q,
                    mysql_db_name_hack(t, self.__destination_conn))
            print('Disconnected tables completed in {}s'.format(time.time() -
                                                                start_time))
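
get_topological_order_by_tables is not shown on this page; conceptually it orders the connected tables so that referenced (target) tables come before the tables that reference them. A minimal sketch of that idea using Kahn's algorithm, assuming the relationship dicts carry 'fk_table' and 'target_table' keys as they do elsewhere here (the real function may differ, e.g. by returning strata):

from collections import defaultdict, deque


def topological_order_sketch(relationships, tables):
    # Edge direction: target_table -> fk_table, so parents come first.
    children = defaultdict(set)
    indegree = {t: 0 for t in tables}
    for r in relationships:
        src, dst = r['target_table'], r['fk_table']
        if src in indegree and dst in indegree and dst not in children[src]:
            children[src].add(dst)
            indegree[dst] += 1
    queue = deque(t for t in tables if indegree[t] == 0)
    order = []
    while queue:
        t = queue.popleft()
        order.append(t)
        for c in children[t]:
            indegree[c] -= 1
            if indegree[c] == 0:
                queue.append(c)
    return order  # cycles (e.g. self-references) would need extra handling
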
Example #9
    def __subset_upstream(self, target, processed_tables, relationships):

        redacted_relationships = redact_relationships(relationships)
        relevant_key_constraints = list(
            filter(
                lambda r: r['target_table'] in processed_tables and r[
                    'fk_table'] == target, redacted_relationships))
        # This table isn't referenced by anything we've already processed, so
        # leave it empty, OR the table was already added, which only happens
        # when the upstream table was also a direct target.
        if len(relevant_key_constraints) == 0 or target in processed_tables:
            return False

        temp_target_name = 'subset_temp_' + table_name(target)

        try:
            # copy the whole table
            columns_query = columns_to_copy(target, relationships,
                                            self.__source_conn)
            self.__db_helper.run_query(
                'CREATE TEMPORARY TABLE {} AS SELECT * FROM {} LIMIT 0'.format(
                    quoter(temp_target_name),
                    fully_qualified_table(
                        mysql_db_name_hack(target, self.__destination_conn))),
                self.__destination_conn)
            query = 'SELECT {} FROM {}'.format(columns_query,
                                               fully_qualified_table(target))
            self.__db_helper.copy_rows(self.__source_conn,
                                       self.__destination_conn, query,
                                       temp_target_name)

            # filter it down in the target database
            table_columns = self.__db_helper.get_table_columns(
                table_name(target), schema_name(target), self.__source_conn)
            clauses = [
                '{} IN (SELECT {} FROM {})'.format(
                    columns_tupled(kc['fk_columns']),
                    columns_joined(kc['target_columns']),
                    fully_qualified_table(
                        mysql_db_name_hack(kc['target_table'],
                                           self.__destination_conn)))
                for kc in relevant_key_constraints
            ]
            clauses.extend(upstream_filter_match(target, table_columns))

            select_query = 'SELECT * FROM {} WHERE TRUE AND {}'.format(
                quoter(temp_target_name), ' AND '.join(clauses))
            insert_query = 'INSERT INTO {} {}'.format(
                fully_qualified_table(
                    mysql_db_name_hack(target, self.__destination_conn)),
                select_query)
            self.__db_helper.run_query(insert_query, self.__destination_conn)
            self.__destination_conn.commit()

        finally:
            # delete temporary table
            mysql_temporary = 'TEMPORARY' if config_reader.get_db_type() == 'mysql' else ''
            self.__db_helper.run_query(
                'DROP {} TABLE IF EXISTS {}'.format(mysql_temporary,
                                                    quoter(temp_target_name)),
                self.__destination_conn)

        return True
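
Across all of these examples the relationship dicts share one shape, inferred from the keys accessed above (the values here are purely illustrative):

relationship = {
    'fk_table': 'public.orders',      # referencing table
    'fk_columns': ['user_id'],        # foreign key column(s) on fk_table
    'target_table': 'public.users',   # referenced table
    'target_columns': ['id'],         # referenced column(s), typically the PK
}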