def subset_downstream(self, table, relationships):
    referencing_tables = self.__db_helper.get_redacted_table_references(
        table, self.__all_tables, self.__source_conn)

    if len(referencing_tables) > 0:
        pk_columns = referencing_tables[0]['target_columns']
    else:
        # nothing references this table, so there is nothing downstream to pull in
        return

    temp_table = self.__db_helper.create_id_temp_table(
        self.__destination_conn, len(pk_columns))

    # stage the keys of referenced rows that the destination still lacks
    for r in referencing_tables:
        fk_table = r['fk_table']
        fk_columns = r['fk_columns']

        q = 'SELECT {} FROM {} WHERE {} NOT IN (SELECT {} FROM {})'.format(
            columns_joined(fk_columns),
            fully_qualified_table(
                mysql_db_name_hack(fk_table, self.__destination_conn)),
            columns_tupled(fk_columns),
            columns_joined(pk_columns),
            fully_qualified_table(
                mysql_db_name_hack(table, self.__destination_conn)))

        self.__db_helper.copy_rows(self.__destination_conn,
                                   self.__destination_write_conn, q,
                                   temp_table)

    columns_query = columns_to_copy(table, relationships, self.__source_conn)

    # stream the staged keys back out and copy the matching source rows
    cursor_name = 'table_cursor_' + str(uuid.uuid4()).replace('-', '')
    cursor = self.__destination_conn.cursor(name=cursor_name, withhold=True)
    cursor_query = 'SELECT DISTINCT * FROM {}'.format(
        fully_qualified_table(temp_table))
    cursor.execute(cursor_query)

    fetch_row_count = 100000
    while True:
        rows = cursor.fetchmany(fetch_row_count)
        if len(rows) == 0:
            break

        # skip keys containing NULLs; those foreign keys reference nothing
        ids = [
            '(' + ','.join(['\'' + str(c) + '\'' for c in row]) + ')'
            for row in rows if all([c is not None for c in row])
        ]
        if len(ids) == 0:
            # the whole batch was NULL-laden; move on to the next batch
            # rather than abandoning the remaining batches
            continue
        ids_to_query = ','.join(ids)

        q = 'SELECT {} FROM {} WHERE {} IN ({})'.format(
            columns_query, fully_qualified_table(table),
            columns_tupled(pk_columns), ids_to_query)
        self.__db_helper.copy_rows(
            self.__source_conn, self.__destination_conn, q,
            mysql_db_name_hack(table, self.__destination_conn))

    cursor.close()
def copy_rows(source, destination, query, destination_table):
    cursor = source.cursor()
    try:
        cursor.execute(query)

        fetch_row_count = 1000
        while True:
            rows = cursor.fetchmany(fetch_row_count)
            if len(rows) == 0:
                break

            template = ','.join(['%s'] * len(rows[0]))
            destination_cursor = destination.cursor()
            insert_query = 'INSERT INTO {} VALUES ({})'.format(
                fully_qualified_table(destination_table), template)
            destination_cursor.executemany(insert_query, rows)
            destination_cursor.close()
            destination.commit()

            if len(rows) < fetch_row_count:
                # necessary because mysql doesn't behave if you fetchmany after the last row
                break
    except Exception as e:
        if hasattr(e, 'msg') and e.msg.startswith('Table') and e.msg.endswith(
                'doesn\'t exist'):
            raise ValueError(
                'Your database has foreign keys to another database. This is not currently supported.'
            )
        else:
            # re-raise without losing the original traceback
            raise
    finally:
        cursor.close()
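# A minimal usage sketch for copy_rows above, assuming mysql-connector-python
# and hypothetical hosts, credentials, and table names; the helper only needs
# two live connections, a SELECT query, and a destination table name in
# whatever form fully_qualified_table accepts.
def _example_copy_rows_usage():
    import mysql.connector
    source = mysql.connector.connect(
        user='reader', password='...', host='source-host', database='app')
    destination = mysql.connector.connect(
        user='writer', password='...', host='dest-host', database='app_subset')
    try:
        copy_rows(source, destination,
                  'SELECT * FROM app.users WHERE id < 100',
                  'app_subset.users')
    finally:
        source.close()
        destination.close()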
def clean_temp_table_cells(fk_table, fk_columns, target_table, target_columns,
                           conn):
    # NULL out foreign-key cells whose referenced row did not survive the
    # subset, keeping the copied data referentially consistent
    fk_alias = 'tonic_subset_398dhjr23_fk'
    target_alias = 'tonic_subset_398dhjr23_target'

    fk_table = fully_qualified_table(source_db_temp_table(fk_table))
    target_table = fully_qualified_table(source_db_temp_table(target_table))
    assignment_list = ','.join(
        ['{} = NULL'.format(quoter(c)) for c in fk_columns])
    column_matching = ' AND '.join([
        '{}.{} = {}.{}'.format(fk_alias, quoter(fc), target_alias, quoter(tc))
        for fc, tc in zip(fk_columns, target_columns)
    ])
    q = 'UPDATE {} {} SET {} WHERE NOT EXISTS (SELECT 1 FROM {} {} WHERE {})'.format(
        fk_table, fk_alias, assignment_list, target_table, target_alias,
        column_matching)
    run_query(q, conn)
def create_id_temp_table(conn, number_of_columns):
    # scratch table named with a UUID to avoid collisions; every column is
    # text so key tuples of any shape fit
    temp_table = temp_db + '.' + str(uuid.uuid4())
    cursor = conn.cursor()
    column_defs = ',\n'.join(
        ['    col' + str(i) + ' text' for i in range(number_of_columns)])
    q = 'CREATE TABLE {} (\n{}\n)'.format(fully_qualified_table(temp_table),
                                          column_defs)
    cursor.execute(q)
    cursor.close()
    return temp_table
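# Usage sketch: subset_downstream (above) relies on this helper to stage
# candidate key tuples. The generated columns are col0..col{n-1}, all text,
# so composite keys of any type fit; temp_db is assumed to be a module-level
# scratch schema configured elsewhere in the repo. Names are illustrative.
def _example_id_temp_table(conn):
    temp_table = create_id_temp_table(conn, 2)  # e.g. a two-column composite key
    # keys are then bulk-loaded with copy_rows(...) and read back via
    # 'SELECT DISTINCT * FROM {}'.format(fully_qualified_table(temp_table))
    return temp_table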
def copy_to_temp_table(conn, query, target_table, pk_columns=None):
    temp_table = fully_qualified_table(source_db_temp_table(target_table))
    with conn.cursor() as cur:
        # LIMIT 0 clones the query's column shape without copying any rows
        cur.execute('CREATE TEMPORARY TABLE IF NOT EXISTS ' + temp_table +
                    ' AS ' + query + ' LIMIT 0')
        if pk_columns:
            # only insert rows whose keys aren't already in the temp table
            query = query + ' WHERE {} NOT IN (SELECT {} FROM {})'.format(
                columns_tupled(pk_columns), columns_joined(pk_columns),
                temp_table)
        cur.execute('INSERT INTO ' + temp_table + ' ' + query)
    conn.commit()
def __subset_direct(self, target, relationships):
    t = target['table']
    columns_query = columns_to_copy(t, relationships, self.__source_conn)
    if 'where' in target:
        q = 'SELECT {} FROM {} WHERE {}'.format(columns_query,
                                                fully_qualified_table(t),
                                                target['where'])
    elif 'percent' in target:
        if config_reader.get_db_type() == 'postgres':
            q = 'SELECT {} FROM {} WHERE random() < {}'.format(
                columns_query, fully_qualified_table(t),
                float(target['percent']) / 100)
        else:
            q = 'SELECT {} FROM {} WHERE rand() < {}'.format(
                columns_query, fully_qualified_table(t),
                float(target['percent']) / 100)
    else:
        raise ValueError(
            'target table {} had no \'where\' or \'percent\' term defined, check your configuration.'
            .format(t))
    self.__db_helper.copy_rows(self.__source_conn, self.__destination_conn, q,
                               mysql_db_name_hack(t, self.__destination_conn))
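# For reference, the direct-target entries consumed above are config dicts
# carrying a 'table' key plus either a 'where' clause or a 'percent' sampling
# rate. A hypothetical pair of examples (table names are illustrative):
_EXAMPLE_DIRECT_TARGETS = [
    {'table': 'public.users', 'where': "created_at > '2019-01-01'"},
    {'table': 'public.events', 'percent': 5},
]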
def copy_rows(source, destination, query, destination_table):
    datatypes = get_table_datatypes(table_name(destination_table),
                                    schema_name(destination_table),
                                    destination)

    def template_piece(dt):
        # json/jsonb array columns need an explicit cast in the VALUES template
        if dt == '_json':
            return '%s::json[]'
        elif dt == '_jsonb':
            return '%s::jsonb[]'
        else:
            return '%s'

    template = '(' + ','.join([template_piece(dt) for dt in datatypes]) + ')'

    # a named (server-side) cursor keeps memory bounded on large tables
    cursor_name = 'table_cursor_' + str(uuid.uuid4()).replace('-', '')
    cursor = source.cursor(name=cursor_name)
    cursor.execute(query)

    fetch_row_count = 100000
    while True:
        rows = cursor.fetchmany(fetch_row_count)
        if len(rows) == 0:
            break

        # we end up doing a lot of execute statements here, copying data.
        # using the inner_cursor means we don't log all the noise
        destination_cursor = destination.cursor().inner_cursor
        insert_query = 'INSERT INTO {} VALUES %s'.format(
            fully_qualified_table(destination_table))
        execute_values(destination_cursor, insert_query, rows, template)
        destination_cursor.close()

    cursor.close()
    destination.commit()
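# Self-contained sketch of the psycopg2 execute_values call used above: the
# '%s' after VALUES is expanded into one template per row, and the template
# may carry casts, mirroring the json/jsonb handling in copy_rows. The
# connection string, table, and values are hypothetical.
def _example_execute_values():
    import psycopg2
    from psycopg2.extras import execute_values
    conn = psycopg2.connect('dbname=app_subset user=writer')
    with conn.cursor() as cur:
        execute_values(cur, 'INSERT INTO app.tags VALUES %s',
                       [(1, '{"a": 1}'), (2, '{"b": 2}')],
                       template='(%s,%s::json)')
    conn.commit()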
def run_middle_out(self):
    passthrough_tables = self.__get_passthrough_tables()
    relationships = self.__db_helper.get_unredacted_fk_relationships(
        self.__all_tables, self.__source_conn)

    disconnected_tables = compute_disconnected_tables(
        config_reader.get_initial_target_tables(), passthrough_tables,
        self.__all_tables, relationships)
    connected_tables = [
        table for table in self.__all_tables
        if table not in disconnected_tables
    ]
    order = get_topological_order_by_tables(relationships, connected_tables)
    order = list(order)

    # start by subsetting the direct targets
    print('Beginning subsetting with these direct targets: ' +
          str(config_reader.get_initial_target_tables()))
    start_time = time.time()
    processed_tables = set()
    for idx, target in enumerate(config_reader.get_initial_targets()):
        print_progress(target, idx + 1,
                       len(config_reader.get_initial_targets()))
        self.__subset_direct(target, relationships)
        processed_tables.add(target['table'])
    print('Direct target tables completed in {}s'.format(time.time() -
                                                          start_time))

    # greedily grab rows with foreign keys to rows in the target strata
    upstream_tables = compute_upstream_tables(
        config_reader.get_initial_target_tables(), order)
    print('Beginning greedy upstream subsetting with these tables: ' +
          str(upstream_tables))
    start_time = time.time()
    for idx, t in enumerate(upstream_tables):
        print_progress(t, idx + 1, len(upstream_tables))
        data_added = self.__subset_upstream(t, processed_tables,
                                            relationships)
        if data_added:
            processed_tables.add(t)
    print('Greedy subsetting completed in {}s'.format(time.time() -
                                                      start_time))

    # process pass-through tables; this must happen before subset_downstream
    # so that all required downstream rows are available
    print('Beginning pass-through tables: ' + str(passthrough_tables))
    start_time = time.time()
    for idx, t in enumerate(passthrough_tables):
        print_progress(t, idx + 1, len(passthrough_tables))
        q = 'SELECT * FROM {}'.format(fully_qualified_table(t))
        self.__db_helper.copy_rows(
            self.__source_conn, self.__destination_conn, q,
            mysql_db_name_hack(t, self.__destination_conn))
    print('Pass-through completed in {}s'.format(time.time() - start_time))

    # use subset_downstream to get all supporting rows according to existing needs
    downstream_tables = compute_downstream_tables(passthrough_tables,
                                                  disconnected_tables, order)
    print('Beginning downstream subsetting with these tables: ' +
          str(downstream_tables))
    start_time = time.time()
    for idx, t in enumerate(downstream_tables):
        print_progress(t, idx + 1, len(downstream_tables))
        self.subset_downstream(t, relationships)
    print('Downstream subsetting completed in {}s'.format(time.time() -
                                                          start_time))

    if config_reader.keep_disconnected_tables():
        # copy all the data for tables in disconnected components
        # (i.e. pass those tables through)
        print('Beginning disconnected tables: ' + str(disconnected_tables))
        start_time = time.time()
        for idx, t in enumerate(disconnected_tables):
            print_progress(t, idx + 1, len(disconnected_tables))
            q = 'SELECT * FROM {}'.format(fully_qualified_table(t))
            self.__db_helper.copy_rows(
                self.__source_conn, self.__destination_conn, q,
                mysql_db_name_hack(t, self.__destination_conn))
        print('Disconnected tables completed in {}s'.format(time.time() -
                                                            start_time))
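# get_topological_order_by_tables is assumed to linearize the foreign-key
# graph so the upstream/downstream passes above can walk tables in a safe
# order. A minimal sketch of one such ordering using Kahn's algorithm,
# assuming each relationship is a dict with 'fk_table' and 'target_table'
# keys as elsewhere in this module (the edge direction is illustrative, and
# cycles/self-references are not handled):
def _example_topological_order(relationships, tables):
    from collections import defaultdict, deque
    dependents = defaultdict(list)
    in_degree = {t: 0 for t in tables}
    for r in relationships:
        if r['fk_table'] in in_degree and r['target_table'] in in_degree:
            dependents[r['fk_table']].append(r['target_table'])
            in_degree[r['target_table']] += 1
    queue = deque(t for t in tables if in_degree[t] == 0)
    order = []
    while queue:
        t = queue.popleft()
        order.append(t)
        for d in dependents[t]:
            in_degree[d] -= 1
            if in_degree[d] == 0:
                queue.append(d)
    return order  # fk tables come before the tables they reference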
def __subset_upstream(self, target, processed_tables, relationships):
    redacted_relationships = redact_relationships(relationships)
    relevant_key_constraints = list(
        filter(
            lambda r: r['target_table'] in processed_tables and r[
                'fk_table'] == target, redacted_relationships))

    # skip if nothing we've already processed references this table (leave it
    # empty), or if the table was itself already added, which only happens
    # when the upstream table was also a direct target
    if len(relevant_key_constraints) == 0 or target in processed_tables:
        return False

    temp_target_name = 'subset_temp_' + table_name(target)

    try:
        # copy the whole table into a temporary staging table
        columns_query = columns_to_copy(target, relationships,
                                        self.__source_conn)
        self.__db_helper.run_query(
            'CREATE TEMPORARY TABLE {} AS SELECT * FROM {} LIMIT 0'.format(
                quoter(temp_target_name),
                fully_qualified_table(
                    mysql_db_name_hack(target, self.__destination_conn))),
            self.__destination_conn)
        query = 'SELECT {} FROM {}'.format(columns_query,
                                           fully_qualified_table(target))
        self.__db_helper.copy_rows(self.__source_conn,
                                   self.__destination_conn, query,
                                   temp_target_name)

        # filter it down in the target database to rows that reference
        # something we've already copied
        table_columns = self.__db_helper.get_table_columns(
            table_name(target), schema_name(target), self.__source_conn)
        clauses = [
            '{} IN (SELECT {} FROM {})'.format(
                columns_tupled(kc['fk_columns']),
                columns_joined(kc['target_columns']),
                fully_qualified_table(
                    mysql_db_name_hack(kc['target_table'],
                                       self.__destination_conn)))
            for kc in relevant_key_constraints
        ]
        clauses.extend(upstream_filter_match(target, table_columns))

        select_query = 'SELECT * FROM {} WHERE TRUE AND {}'.format(
            quoter(temp_target_name), ' AND '.join(clauses))
        insert_query = 'INSERT INTO {} {}'.format(
            fully_qualified_table(
                mysql_db_name_hack(target, self.__destination_conn)),
            select_query)
        self.__db_helper.run_query(insert_query, self.__destination_conn)
        self.__destination_conn.commit()
    finally:
        # delete the temporary staging table
        mysql_temporary = 'TEMPORARY' if config_reader.get_db_type(
        ) == 'mysql' else ''
        self.__db_helper.run_query(
            'DROP {} TABLE IF EXISTS {}'.format(mysql_temporary,
                                                quoter(temp_target_name)),
            self.__destination_conn)

    return True