import uuid

from psycopg2.extras import execute_values


def copy_rows(source, destination, query, destination_table):
    datatypes = get_table_datatypes(table_name(destination_table),
                                    schema_name(destination_table),
                                    destination)

    def template_piece(dt):
        if dt == '_json':
            return '%s::json[]'
        elif dt == '_jsonb':
            return '%s::jsonb[]'
        else:
            return '%s'

    template = '(' + ','.join([template_piece(dt) for dt in datatypes]) + ')'

    cursor_name = 'table_cursor_' + str(uuid.uuid4()).replace('-', '')
    cursor = source.cursor(name=cursor_name)
    cursor.execute(query)

    fetch_row_count = 100000
    while True:
        rows = cursor.fetchmany(fetch_row_count)
        if len(rows) == 0:
            break

        # we end up doing a lot of execute statements here, copying data.
        # using the inner_cursor means we don't log all the noise
        destination_cursor = destination.cursor().inner_cursor
        insert_query = 'INSERT INTO {} VALUES %s'.format(
            fully_qualified_table(destination_table))
        execute_values(destination_cursor, insert_query, rows, template)
        destination_cursor.close()

    cursor.close()
    destination.commit()
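# Hedged usage sketch (not part of the original module): copy_rows streams the result of
# `query` from `source` into `destination_table` on `destination` in 100k-row batches.
# Note that `destination.cursor()` is expected to expose an `inner_cursor` attribute, so
# `destination` is presumably a project-specific connection wrapper rather than a raw
# psycopg2 connection; the DSNs, the `get_connection` factory, and the table name below
# are illustrative assumptions only.
#
#   source_conn = psycopg2.connect('dbname=source_db')          # hypothetical DSN
#   destination_conn = get_connection('dbname=destination_db')  # hypothetical wrapper factory
#   copy_rows(source_conn,
#             destination_conn,
#             'SELECT * FROM public.users',
#             'public.users')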
def __subset_upstream(self, target, processed_tables, relationships):

    redacted_relationships = redact_relationships(relationships)
    relevant_key_constraints = list(
        filter(
            lambda r: r['target_table'] in processed_tables and r['fk_table'] == target,
            redacted_relationships))

    # this table isn't referenced by anything we've already processed, so let's leave it empty
    # OR
    # table was already added, this only happens if the upstream table was also a direct target
    if len(relevant_key_constraints) == 0 or target in processed_tables:
        return False

    temp_target_name = 'subset_temp_' + table_name(target)

    try:
        # copy the whole table
        columns_query = columns_to_copy(target, relationships, self.__source_conn)
        self.__db_helper.run_query(
            'CREATE TEMPORARY TABLE {} AS SELECT * FROM {} LIMIT 0'.format(
                quoter(temp_target_name),
                fully_qualified_table(
                    mysql_db_name_hack(target, self.__destination_conn))),
            self.__destination_conn)
        query = 'SELECT {} FROM {}'.format(columns_query, fully_qualified_table(target))
        self.__db_helper.copy_rows(self.__source_conn, self.__destination_conn,
                                   query, temp_target_name)

        # filter it down in the target database
        table_columns = self.__db_helper.get_table_columns(
            table_name(target), schema_name(target), self.__source_conn)
        clauses = [
            '{} IN (SELECT {} FROM {})'.format(
                columns_tupled(kc['fk_columns']),
                columns_joined(kc['target_columns']),
                fully_qualified_table(
                    mysql_db_name_hack(kc['target_table'], self.__destination_conn)))
            for kc in relevant_key_constraints
        ]
        clauses.extend(upstream_filter_match(target, table_columns))

        select_query = 'SELECT * FROM {} WHERE TRUE AND {}'.format(
            quoter(temp_target_name), ' AND '.join(clauses))
        insert_query = 'INSERT INTO {} {}'.format(
            fully_qualified_table(
                mysql_db_name_hack(target, self.__destination_conn)),
            select_query)
        self.__db_helper.run_query(insert_query, self.__destination_conn)
        self.__destination_conn.commit()
    finally:
        # delete temporary table
        mysql_temporary = 'TEMPORARY' if config_reader.get_db_type() == 'mysql' else ''
        self.__db_helper.run_query(
            'DROP {} TABLE IF EXISTS {}'.format(mysql_temporary,
                                                quoter(temp_target_name)),
            self.__destination_conn)

    return True
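# Illustrative note (an assumption, not taken from the original code): for a hypothetical
# foreign key public.orders.user_id -> public.users.id, where public.users has already been
# subset into the destination, the clause built above for that constraint would look like
#
#   (user_id) IN (SELECT id FROM <destination db>.public.users)
#
# so the temporary full copy of public.orders is filtered down to rows whose parent rows
# survived the subset before being inserted into the real destination table.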
def source_db_temp_table(target_table):
    return 'tonic_subset_' + schema_name(target_table) + '_' + table_name(target_table)
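# For example (assuming schema_name/table_name split a 'schema.table' identifier, which is
# an assumption about helpers defined elsewhere):
#   source_db_temp_table('public.users')  ->  'tonic_subset_public_users'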
def source_db_temp_table(target_table):
    return temp_db + '.' + schema_name(target_table) + '_' + table_name(target_table)
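# This variant places the temp table in `temp_db`, presumably a module-level name for a
# scratch database defined elsewhere (e.g. in a MySQL-specific helper). Assuming temp_db
# were 'tonic_subset_temp', the same call would yield 'tonic_subset_temp.public_users'.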