def pull_modifications_since(date, table, column, db_pair, cli_args, condition=None, printer=Prindenter()):
    """Re-pull the rows whose `column` value is newer than `date`.

    Asks upstream for the ids of rows where `column > date` (optionally also
    restricted by `condition`). For each batch of ids it then dumps the rows
    from upstream, deletes the same ids downstream, and loads the dump.

    Args:
        date: value interpolated, single-quoted, into the comparison.
        table: object providing `name` and `id_col`.
        column: name of the column compared against `date`.
        db_pair: provides `upstream.connection`, `upstream.args` and
            `downstream.args`.
        cli_args: provides the `upstream` / `downstream` arguments handed to
            `mysqldump_data` / `mysqlload`.
        condition: optional extra SQL predicate ANDed onto the id query.
        printer: callable used for progress output.

    Returns:
        True when matching rows were found and transferred, False otherwise.
    """
    printer(f"syncing rows from {db_pair.upstream.args.database}.{table.name} with {column} newer than {date}")
    if condition:
        printer(f"... where {condition}")

    with Indent(printer):
        with db_pair.upstream.connection.cursor() as upstream_cursor:
            # same statement text as before, assembled incrementally
            newer_than_sql = f"select {table.id_col} from {table.name} where {column} > '{date}'"
            if condition:
                newer_than_sql += f" and {condition}"
            newer_than_sql += ";"
            newer_rows = show_do_query(upstream_cursor, newer_than_sql, printer=printer)

        if not newer_rows:
            printer("No recent modifications found")
            return False

        ids_to_sync = [row[table.id_col] for row in newer_rows]
        printer(f"Found {len(ids_to_sync)} such rows")

        # one "id_col in (...)" predicate per batch of ids
        batch_predicates = [
            f"{table.id_col} in ({','.join(str(row_id) for row_id in batch)})"
            for batch in Ids.partition(Constants.batch_conditions, ids_to_sync)
        ]

        with Indent(printer):
            printer(f"Proceeding in {len(batch_predicates)} batches")
            for batch_predicate in batch_predicates:
                # dump upstream data
                mysqldump_data(cli_args.upstream, table.name, batch_predicate, printer=printer)

                # clear old rows from downstream
                delete = f"delete from {table.name} where {batch_predicate};"
                with Connection(db_pair.downstream.args) as downstream_connection:
                    with downstream_connection.cursor() as cursor:
                        show_do_query(cursor, delete, printer=printer)

                # load new rows into downstream
                mysqlload(cli_args.downstream, table.name, printer=printer)

        return True
def is_synced(self, upstream_cursor, downstream_cursor, printer=Prindenter()):
    """Report whether `CHECKSUM TABLE` agrees between upstream and downstream.

    Runs `checksum table <self.name>;` on both cursors and compares the
    'Checksum' column of the first result row from each side.

    Args:
        upstream_cursor: cursor on the upstream database.
        downstream_cursor: cursor on the downstream database.
        printer: callable used for progress output.

    Returns:
        True when both checksums are equal, False otherwise.
    """
    get_checksum = f'checksum table {self.name};'
    with Indent(printer):
        result = show_do_query(upstream_cursor, get_checksum, printer=printer)
        upstream_checksum = result[0]['Checksum']

        result = show_do_query(downstream_cursor, get_checksum, printer=printer)
        downstream_checksum = result[0]['Checksum']

        if upstream_checksum != downstream_checksum:
            return False

        # This message used to sit after both return statements and could
        # never be reached; emit it before reporting success.
        printer(f"{self.name} is identical on either side")
        return True
def md5_row_ranges(cursor, table, condition, granularity, printer=Prindenter()):
    """Fingerprint `table` in id-ranges of `granularity` rows.

    Each row matching `condition` is hashed (MD5 over its '|'-joined column
    expressions), rows are grouped by FLOOR(id / granularity), and each
    group's ordered row hashes are hashed again.

    Args:
        cursor: cursor to run the query on.
        table: object providing `name`, `id_col` and `columns` (SQL column
            expressions).
        condition: SQL predicate selecting the rows to scan.
        granularity: size of each id range; must be greater than 1.
        printer: callable used for progress output.

    Returns:
        dict mapping Interval(range_begin, range_end) to the range fingerprint.

    Raises:
        ValueError: if `granularity` is 1 or less.
    """
    if granularity <= 1:
        raise ValueError("Variable granularity scanner called, but a trivial granule size was provided")

    converted_columns_str = ",".join(table.columns)
    shortened_condition = pretty_shorten(condition)[:-1]

    printer(f"[ Fingerprinting {cursor.connection.db}.{table.name} in row-ranges of size {granularity}\n"
            f" where {table.id_col} in {shortened_condition} ]")

    # inner select: one fingerprint per row; outer select: one per row group
    range_query = (
        " SELECT MD5(GROUP_CONCAT(row_fingerprint ORDER BY id)) AS range_fingerprint,"
        f" row_group * {granularity} as range_begin,"
        f" (row_group + 1) * {granularity} - 1 as range_end"
        f" FROM (SELECT MD5(CONCAT_WS('|', {converted_columns_str})) as row_fingerprint,"
        f" FLOOR({table.id_col}/{granularity}) as row_group,"
        f" {table.id_col} as id"
        f" FROM {table.name}"
        f" WHERE {condition}"
        f" ORDER BY {table.id_col}) as r"
        " GROUP BY row_group; "
    )

    with Indent(printer):
        range_rows = show_do_query(cursor, range_query, printer=printer)

    # organize fingerprints by interval
    fingerprints = {}
    for row in range_rows:
        fingerprints[Interval(row['range_begin'], row['range_end'])] = row['range_fingerprint']
    return fingerprints
def make_space_downstream(printer):
    """Delete downstream rows whose id is beyond upstream's max id.

    Reads `db_pair`, `table` and `condition` from the surrounding scope
    (they are not parameters of this function). When `condition` is truthy
    it is ANDed onto the delete's WHERE clause.

    Args:
        printer: callable used for progress output.
    """
    with Connection(db_pair.downstream.args) as downstream_connection:
        with downstream_connection.cursor() as cursor:
            predicate = f'{table.id_col} > {table.upstream.max_id}'
            if condition:
                predicate = f'{predicate} and {condition}'
            delete = f'delete from {table.name} where {predicate};'

            with Indent(printer):
                show_do_query(cursor, delete, printer=printer)
def get_last_touched_date(table, column, db, printer=Prindenter()):
    """Return `max(column)` from `table` on the database described by `db`.

    Args:
        table: table name, interpolated directly into the query.
        column: column name, interpolated directly into the query.
        db: object whose `args` (with `host` and `database`) is used to open
            a new Connection.
        printer: callable used for progress output.

    Returns:
        The value of `max(column)` from the first result row.
    """
    printer(f"[ Finding most recent modification date from {db.args.host}.{db.args.database}.{table}.{column} ]")

    # the unaliased aggregate doubles as the key of the result row
    aggregate = f"max({column})"

    with Indent(printer):
        with Connection(db.args) as connection, connection.cursor() as cursor:
            rows = show_do_query(cursor, f'select {aggregate} from {table};', printer=printer)
            most_recent = rows[0][aggregate]
            printer(f"Found: {most_recent}")
            return most_recent
def strip_fk(mysql_args, printer=Prindenter()):
    """Drop every foreign-key constraint in the database named by `mysql_args`.

    Lists the constraints from INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS for
    the target schema and issues one `ALTER TABLE ... DROP FOREIGN KEY ...`
    per constraint on the same connection.

    Args:
        mysql_args: connection arguments; `database` names the schema whose
            foreign keys are dropped.
        printer: callable used for progress output.
    """
    # Previously read `cli_args.database`, a name that is not a parameter of
    # this function; the schema must be the one we are connecting to.
    target_db = mysql_args.database

    with Connection(mysql_args) as connection:
        with connection.cursor() as cursor:
            foreign_keys = (
                " SELECT CONSTRAINT_NAME, TABLE_NAME"
                " FROM INFORMATION_SCHEMA.REFERENTIAL_CONSTRAINTS"
                " WHERE CONSTRAINT_SCHEMA = '{}' "
            ).format(target_db)
            constraints = show_do_query(cursor, foreign_keys, printer=printer)

            for row in constraints:
                fk = row['CONSTRAINT_NAME']
                table = row['TABLE_NAME']
                drop_constraint = " ALTER TABLE {} DROP FOREIGN KEY {}; ".format(table, fk)
                with Indent(printer):
                    show_do_query(cursor, drop_constraint, printer=printer)
def show_create(cursor, table_name, printer=Prindenter()):
    """Return the stripped `SHOW CREATE TABLE` statement for `table_name`.

    Args:
        cursor: cursor to run the statement on.
        table_name: table name, interpolated directly into the statement.
        printer: callable used for progress output.

    Returns:
        The 'Create Table' value of the first result row, whitespace-stripped.
    """
    printer(f"[Extracting creation SQL from {cursor.connection.db}.{table_name}]")
    statement = f" SHOW CREATE TABLE {table_name}; "
    with Indent(printer):
        rows = show_do_query(cursor, statement, printer=printer)
        creation_sql = rows[0]['Create Table']
        return creation_sql.strip()
def create_twin_if_not_exists(upstream_cursor, downstream_cursor, table_name, printer=Prindenter()):
    """Create `table_name` downstream from upstream's definition if it is missing.

    Looks the table up in information_schema.tables for the downstream
    connection's database. If no row comes back, the upstream
    `SHOW CREATE TABLE` output is executed on the downstream cursor.

    Args:
        upstream_cursor: cursor used to fetch the creation SQL.
        downstream_cursor: cursor used for the existence check and creation.
        table_name: name of the table to check or create.
        printer: callable used for progress output.
    """
    downstream_db = downstream_cursor.connection.db
    printer(f"[Checking for table existence: {downstream_db}.{table_name}]")

    existence_query = (
        " SELECT * FROM information_schema.tables"
        f" WHERE table_schema = '{downstream_db}'"
        f" AND table_name = '{table_name}'"
        " LIMIT 1; "
    )

    with Indent(printer):
        found = show_do_query(downstream_cursor, existence_query, printer=printer)

        if not any(found):
            printer("It does not exist, creating it")
            creation_sql = show_create(upstream_cursor, table_name, printer=printer)
            show_do_query(downstream_cursor, creation_sql, printer=printer)
        else:
            printer("It exists, moving on")
def strip_uk(target_args, printer=Prindenter()):
    """Drop every UNIQUE constraint in the database named by `target_args`.

    Lists the distinct (constraint_name, table_name) pairs of type 'UNIQUE'
    from information_schema.table_constraints for the target schema and
    issues one `DROP INDEX ... ON ...` per pair on the same connection.

    Args:
        target_args: connection arguments; `database` names the schema.
        printer: callable used for progress output.
    """
    target_db = target_args.database
    unique_keys = (
        " SELECT DISTINCT constraint_name, table_name"
        " FROM information_schema.table_constraints"
        " WHERE constraint_type = 'UNIQUE'"
        f" AND table_schema = '{target_db}'; "
    )

    with Connection(target_args) as connection, connection.cursor() as cursor:
        for constraint in show_do_query(cursor, unique_keys, printer=printer):
            index_name = constraint['constraint_name']
            owning_table = constraint['table_name']
            drop_constraint = f" DROP INDEX {index_name} ON {owning_table}; "
            with Indent(printer):
                show_do_query(cursor, drop_constraint, printer=printer)
def get_group_concat(cursor, try_set=10000000 * 33, printer=Prindenter()):
    """Negotiate `group_concat_max_len` and work out how many row hashes fit.

    Requests `try_set` bytes for the session, reads back the value the server
    actually reports, and divides it by the length of one md5 digest plus a
    comma separator.

    The request is deliberately bounded: many small queries are preferred
    over a few huge ones that tie up the server. Go bigger at your own risk.

    NOTE(review): the original comment described the default as "a million
    rows", but the default is 10,000,000 * 33 bytes; confirm which is meant.

    Args:
        cursor: cursor whose session variable is set and inspected.
        try_set: number of bytes to request for group_concat_max_len.
        printer: callable used for progress output.

    Returns:
        GroupConcat(rows, max_group_concat_bytes).
    """
    server = cursor.connection.host
    printer(f"How many rows is {server} willing to hash at a time?")

    with Indent(printer):
        # ask for the space we would like...
        printer("Asking for lots lof space...")
        show_do_query(cursor, f"set session group_concat_max_len = {try_set};", printer=printer)

        # ...but accept what we're given
        printer("Taking what we can get...")
        granted = show_do_query(
            cursor,
            "show variables where Variable_name = 'group_concat_max_len';",
            printer=printer)
        max_group_concat_bytes = int(granted[0]['Value'])

        # and see how many row hashes that is
        printer("How many of these will fit?")
        measured = show_do_query(
            cursor,
            "select length(concat(md5('foo'),',')) as md5_bytes;",
            printer=printer)
        md5_bytes = int(measured[0]['md5_bytes'])

        rows = floor(max_group_concat_bytes / md5_bytes)
        printer(f"{cursor.connection.host} is willing to hash {rows} rows at a time.")
        return GroupConcat(rows, max_group_concat_bytes)
def pull_foo(db_pair, cli_args, printer=Prindenter()):
    """Replace downstream `foo_ref` / `foo_tokens` with the relevant upstream rows.

    Selects the `foo_ref` rows whose name starts with 'relevant' and dumps
    those rows plus the `foo_tokens` rows they reference. Both downstream
    tables are then truncated and the dumps loaded.

    Args:
        db_pair: provides `upstream.connection` and `downstream.args`.
        cli_args: provides the `upstream` / `downstream` arguments handed to
            `mysqldump_data` / `mysqlload`.
        printer: callable used for progress output; also receives the
            summary lines via `append_summary`.
    """
    # grab only the foo token indices that are relevant
    with db_pair.upstream.connection.cursor() as upstream_cursor:
        get_ids = "select id, foo_token_id from foo_ref where name like 'relevant%';"
        relevant = show_do_query(upstream_cursor, get_ids, printer=printer)

    foo_token_ids = ', '.join(str(row['foo_token_id']) for row in relevant)
    foo_ref_ids = ', '.join(str(row['id']) for row in relevant)

    # dump just those rows
    mysqldump_data(cli_args.upstream, 'foo_ref', f'id in ({foo_ref_ids});', printer=printer)
    mysqldump_data(cli_args.upstream, 'foo_tokens', f'id in ({foo_token_ids});', printer=printer)

    # clear old rows
    with Connection(db_pair.downstream.args) as downstream_connection:
        with downstream_connection.cursor() as cursor:
            for truncate in ('truncate foo_ref;', 'truncate foo_tokens;'):
                show_do_query(cursor, truncate, printer=printer)

    # load new rows
    for table_name in ('foo_ref', 'foo_tokens'):
        mysqlload(cli_args.downstream, table_name, printer=printer)

    printer.append_summary("foo_ref : UP TO DATE for select rows")
    printer.append_summary("foo_tokens : UP TO DATE for select rows")
    printer("foo_tokens and foo_ref are up to date where it matters")
def __init__(self, table_name, cursor, id_col, printer=Prindenter()):
    """Record the table's name, id column, column expressions and max id.

    Args:
        table_name: name of the table; stored as `self.name`.
        cursor: cursor used to examine the columns and query the max id.
        id_col: name of the id column; stored as `self.id_col`.
        printer: callable used for progress output.
    """
    # values also found on Twin that don't disagree between upstream and downstream
    self.name = table_name
    self.id_col = id_col

    # column descriptions with concatenate-friendly modifications
    self.columns = examine_columns(cursor, table_name, printer=printer)

    # how many rows? the unaliased aggregate is also the result-row key
    max_expr = f'max({self.id_col})'
    printer(f"[Finding {max_expr} for {cursor.connection.db}.{self.name}]")
    with Indent(printer):
        rows = show_do_query(cursor, f'select {max_expr} from {self.name};', printer=printer)

    # a falsy max (e.g. NULL for an empty table) is stored as 0
    highest = rows[0][max_expr]
    self.max_id = highest if highest else 0
def fingerprint_groups(cursor, table, top_key, sub_keys, printer=Prindenter()):
    """Group `table` by `top_key` and return one MD5 fingerprint per group.

    Args:
        cursor: cursor to run the query on.
        table: object providing `name` and `upstream.columns` (SQL column
            expressions fed to GROUP_CONCAT).
        top_key: column to group by.
        sub_keys: columns that order the rows inside each group before hashing.
        printer: callable used for progress output.

    Returns:
        The result of `show_do_query`; each row carries `top_key` and
        'group_fingerprint'.
    """
    printer(f"[ Fingerprinting {cursor.connection.db}.{table.name} grouped by {top_key} ]")

    with Indent(printer):
        hashed_columns = ",".join(table.upstream.columns)
        ordering = ",".join(sub_keys)
        group_query = (
            f" SELECT {top_key},"
            f" MD5(GROUP_CONCAT({hashed_columns} ORDER BY {ordering})) AS group_fingerprint"
            f" FROM {table.name}"
            f" GROUP BY {top_key}; "
        )
        return show_do_query(cursor, group_query, printer=printer)
def md5_rows(cursor, table, condition, granularity, printer=Prindenter()):
    """Fingerprint each individual row of `table` that matches `condition`.

    Args:
        cursor: cursor to run the query on.
        table: object providing `name`, `id_col` and `columns` (SQL column
            expressions).
        condition: SQL predicate selecting the rows to scan.
        granularity: must be 1 or less; larger values belong to the
            row-range scanner.
        printer: callable used for progress output.

    Returns:
        dict mapping each row's id to the MD5 of its '|'-joined columns.

    Raises:
        ValueError: if `granularity` is greater than 1.
    """
    if granularity > 1:
        raise ValueError("Individual row scanner called, but a nontrivial row-range size was provided")

    converted_columns_str = ",".join(table.columns)
    shortened_condition = pretty_shorten(condition)[:-1]

    printer(f"[ Fingerprinting each row in {cursor.connection.db}.{table.name}\n"
            f" where {table.id_col} in {shortened_condition} ]")

    row_query = (
        f" SELECT {table.id_col} as id,"
        f" MD5(CONCAT_WS('|', {converted_columns_str})) as fingerprint"
        f" FROM {table.name}"
        f" WHERE {condition}"
        f" ORDER BY {table.id_col}; "
    )

    with Indent(printer):
        result = show_do_query(cursor, row_query, printer=printer)

    # The query aliases the id column to `id`, so that alias is the key in
    # each result row. Indexing by table.id_col only worked when the id
    # column happened to be named 'id'.
    return {row['id']: row['fingerprint'] for row in result}
def examine_columns(cursor, table_name, printer=Prindenter()):
    """Build a concatenate-friendly SQL expression for each column of a table.

    Reads the columns of `table_name` from information_schema.columns for the
    cursor's current database, then wraps each back-quoted column name:
    nullable columns in IFNULL(..., 'NULL'), columns with a collation other
    than 'NULL' / 'utf8_general_ci' in BINARY, and columns whose type
    contains 'binary(' in hex(...).

    Args:
        cursor: cursor to run the query on.
        table_name: name of the table to examine.
        printer: callable used for progress output.

    Returns:
        list of SQL expressions, one per column, in result order.
    """
    printer(f"[Examining Columns on {cursor.connection.db}.{table_name}]")

    with Indent(printer):
        described = show_do_query(
            cursor,
            " SELECT COLUMN_NAME, IS_NULLABLE, COLUMN_TYPE, COLLATION_NAME"
            " FROM information_schema.columns"
            f" WHERE table_schema='{cursor.connection.db}'"
            f" AND table_name='{table_name}'; ",
            printer=printer)

        column_conversions = []
        for column in described:
            # make the column representation concatenate-friendly
            expression = f"`{column['COLUMN_NAME']}`"

            if column['IS_NULLABLE'] == 'YES':
                expression = f"IFNULL({expression}, 'NULL')"

            collation = column['COLLATION_NAME']
            if collation and collation not in ('NULL', 'utf8_general_ci'):
                expression = f"BINARY {expression}"

            if 'binary(' in column['COLUMN_TYPE']:
                expression = f"hex({expression})"

            # your data may deviate in new and exciting ways
            # handle them here ...

            with Indent(printer):
                printer(expression)

            column_conversions.append(expression)

        return column_conversions
def pull_schema(args, upstream_connection, printer=Prindenter()):
    """Load the upstream schema into an empty downstream database.

    If the downstream database already has tables, nothing is changed and a
    message explains that it must be dropped and recreated by hand.
    Otherwise the upstream schema is dumped to 'schema_nofk.sql' via
    `mysqldump_schema_nofk` and loaded downstream.

    Args:
        args: provides `upstream` and `downstream` connection arguments.
        upstream_connection: accepted but not used by this function.
        printer: callable used for progress output.
    """
    target_db = args.downstream.database

    with Connection(args.downstream) as downstream_connection:
        with downstream_connection.cursor() as cursor:
            existing_tables = show_do_query(cursor, 'show tables;', printer=printer)
            table_ct = len(existing_tables)

    if table_ct > 0:
        # if you'd rather I nuke it for you, you're trusting me too much
        printer(f"{target_db} is a nonempty downstream database. "
                "If you want me to create a new database in its place, you'll have to drop and create it yourself.")
        return

    tmp_file = 'schema_nofk.sql'

    # dump schema to a file
    mysqldump_schema_nofk(args.upstream, tmp_file, printer=printer)

    # load from a file
    mysqlload(args.downstream, tmp_file, printer=printer)
def multikey(table, db_pair, cli_args, keycolumns, condition=None, printer=Prindenter()):
    """Sync a table by groups of rows sharing the first key column.

    Two passes are made, both grouping by `keycolumns[0]`:

    1. group sizes (row counts) are compared and differing groups re-pulled;
    2. unless the table checksums then match, per-group MD5 fingerprints
       (rows ordered by `keycolumns[1:]`) are compared and differing groups
       re-pulled.

    Args:
        table: object providing `name`, `upstream.columns` and `is_synced`.
        db_pair: provides `upstream.connection` and `downstream.args`.
        cli_args: provides the `upstream` / `downstream` arguments handed to
            `mysqldump_data` / `mysqlload`.
        keycolumns: key column names; the first groups rows, the rest order
            rows within a group.
        condition: accepted but not used by this function.
        printer: callable used for progress output.

    Returns:
        True if either pass dumped, deleted or loaded anything, else False.
    """

    # given a query result for both up and downstream, sync the rows where check_col differs
    def group_sync(id_col, check_col, upstream, downstream, printer=Prindenter()):
        """Re-pull groups whose `check_col` differs; return True if anything changed."""
        group_fingerprints_by_id = {}

        def populate(name, query_result, key, store):
            for row in query_result:
                try:
                    store.setdefault(row[key], {})
                    store[row[key]][name] = row[check_col]
                except KeyError:
                    # NOTE(review): debugging hook kept from the original;
                    # it opens an interactive shell when a row lacks the
                    # expected key. Consider raising instead.
                    IPython.embed()

        populate('up', upstream, id_col, group_fingerprints_by_id)
        populate('down', downstream, id_col, group_fingerprints_by_id)

        to_delete = []
        to_write = []
        for group_id, stream in group_fingerprints_by_id.items():
            if 'up' not in stream:
                # only downstream has it
                to_delete.append(group_id)
            elif 'down' not in stream:
                # only upstream has it
                to_write.append(group_id)
            elif stream['up'] != stream['down']:
                # both have it, but they disagree: replace it
                to_delete.append(group_id)
                to_write.append(group_id)

        # Test the lists themselves rather than any(...): any([0]) is False,
        # which silently skipped a group whose key is 0.
        write_condition = f"{id_col} in ({','.join(map(str, to_write))})" if to_write else None
        delete_condition = f"{id_col} in ({','.join(map(str, to_delete))})" if to_delete else None

        changed = False

        if write_condition:
            printer(f"Found {len(to_write)} groups to pull down from upstream")
            changed = True
            mysqldump_data(cli_args.upstream, table.name, write_condition, printer=printer)
        else:
            printer("Nothing to pull down from upstream")

        if delete_condition:
            printer("Making space downstream")
            changed = True
            with Indent(printer):
                with Connection(db_pair.downstream.args) as downstream_connection:
                    with downstream_connection.cursor() as cursor:
                        show_do_query(
                            cursor,
                            f'delete from {table.name} where {delete_condition};',
                            printer=printer)
        else:
            printer("Downstream space is open for new data")

        if write_condition:
            # load from a file
            printer("Loading rows")
            mysqlload(cli_args.downstream, table.name, printer=printer)

        return changed

    # group the table by 'top_key' and hash the groups
    def fingerprint_groups(cursor, table, top_key, sub_keys, printer=Prindenter()):
        """Return one MD5 fingerprint per `top_key` group of `table`."""
        printer(f"[ Fingerprinting {cursor.connection.db}.{table.name} grouped by {top_key} ]")
        with Indent(printer):
            all_columns = ",".join(table.upstream.columns)
            subkey_columns = ",".join(sub_keys)
            return show_do_query(
                cursor,
                f" SELECT {top_key},"
                f" MD5(GROUP_CONCAT({all_columns} ORDER BY {subkey_columns})) AS group_fingerprint"
                f" FROM {table.name}"
                f" GROUP BY {top_key}; ",
                printer=printer)

    made_changes = False

    # sync based on row cardinality
    ids = f"SELECT {keycolumns[0]}, count(*) as group_size FROM {table.name} group by 1;"

    with db_pair.upstream.connection.cursor() as upstream_cursor:
        upstream = show_do_query(upstream_cursor, ids, printer=printer)

    with Connection(db_pair.downstream.args) as downstream_connection:
        with downstream_connection.cursor() as downstream_cursor:
            downstream = show_do_query(downstream_cursor, ids, printer=printer)

    printer(f"[ Using {keycolumns[0]} as a key to sync missing rows on table {table.name} ]")
    with Indent(printer):
        # The flag used to be passed in by value and reassigned inside
        # group_sync, so the caller never saw it change. Take the result.
        made_changes = group_sync(keycolumns[0], 'group_size',
                                  upstream, downstream, printer=printer) or made_changes

    # if changes persist, sync based on row contents
    with db_pair.upstream.connection.cursor() as upstream_cursor:
        with Connection(db_pair.downstream.args) as downstream_connection:
            with downstream_connection.cursor() as downstream_cursor:
                if table.is_synced(upstream_cursor, downstream_cursor, printer=printer):
                    return made_changes

                upstream = fingerprint_groups(upstream_cursor, table,
                                              keycolumns[0], keycolumns[1:], printer=printer)
                downstream = fingerprint_groups(downstream_cursor, table,
                                                keycolumns[0], keycolumns[1:], printer=printer)

    printer(f"[ Using {keycolumns[0]} as a key to find mismatched data on table {table.name} ]")
    with Indent(printer):
        made_changes = group_sync(keycolumns[0], 'group_fingerprint',
                                  upstream, downstream, printer=printer) or made_changes

    return made_changes
def _column_definition(create_table_sql, column_name):
    """Return the definition line of `column_name` from SHOW CREATE TABLE output.

    The name is escaped and anchored at the start of the line (optionally
    back-quoted), so it cannot match the `CREATE TABLE` line, a key
    definition, or another column that merely contains it as a substring.

    Raises:
        ValueError: if no line defines `column_name`.
    """
    starts_with_column = re.compile(r"\s*`?" + re.escape(column_name) + r"`?\s")
    for line in create_table_sql.split('\n'):
        if starts_with_column.match(line):
            return line.strip(',').strip()
    raise ValueError("No definition for column {} found in: {}".format(column_name, create_table_sql))


def sync_schema(upstream_cursor, downstream_cursor, table_name, printer=Prindenter()):
    """Alter the downstream table so its columns match upstream's.

    Compares `describe <table>` on both sides. Columns only upstream are
    added, columns only downstream are dropped, and columns whose position,
    default, nullability or type differ are modified. Column definitions are
    taken from upstream's `show create table` output.

    Args:
        upstream_cursor: cursor on the upstream database (read only here).
        downstream_cursor: cursor on the downstream database (altered).
        table_name: name of the table to reconcile.
        printer: callable used for progress output.

    Returns:
        ColumnChanges report whose `added`, `deleted` and `modified` lists
        describe what was changed.

    Raises:
        ValueError: if an upstream column has no definition line in the
            creation SQL, or is still missing downstream after the add step.
    """
    # collect schema changes for reporting
    report = ColumnChanges([], [], [])

    printer("[Examining up and downstream schemas for {}]".format(table_name))
    with Indent(printer):
        describe = 'describe {};'.format(table_name)
        up = TableSchema(show_do_query(upstream_cursor, describe, printer=printer))
        down = TableSchema(show_do_query(downstream_cursor, describe, printer=printer))

        up_columns = {x.field: x for x in up.columns}
        down_columns = {x.field: x for x in down.columns}

        add = {k: v for k, v in up_columns.items() if k not in down_columns}
        delete = {k: v for k, v in down_columns.items() if k not in up_columns}

        upstream_creates_q = "show create table {}".format(table_name)
        upstream_creates = show_do_query(upstream_cursor, upstream_creates_q)
        create_table_sql = upstream_creates[0]['Create Table']

        if add:
            with Indent(printer):
                printer("Adding {} columns downstream".format(len(add)))
                for new_col, schema in add.items():
                    create = _column_definition(create_table_sql, new_col)
                    add_query = "ALTER TABLE {} ADD COLUMN {} ".format(table_name, create)
                    # keep the column in the same position as upstream
                    if schema.after:
                        add_query += "AFTER {}".format(schema.after)
                    else:
                        add_query += "FIRST"
                    add_query += ";"
                    show_do_query(downstream_cursor, add_query)
                    # TODO: import the new table values explicitly, rather than letting the sync catch them
                    report.added.append(new_col)

        if delete:
            with Indent(printer):
                printer("Deleting {} columns downstream".format(len(delete)))
                for removed_col in delete:
                    drop_query = "ALTER TABLE {} DROP COLUMN {};".format(table_name, removed_col)
                    show_do_query(downstream_cursor, drop_query)
                    report.deleted.append(removed_col)

        # re-read downstream so the comparison below sees the adds/drops
        down = TableSchema(show_do_query(downstream_cursor, describe, printer=printer))
        down_columns = {x.field: x for x in down.columns}

        # check for necessary modifications in upstream column order
        for up_col in up_columns.values():
            down_col = down_columns.get(up_col.field)
            if down_col is None:
                # previously surfaced as a bare StopIteration
                raise ValueError("Unable to compare schemas for table {} column {} != {}".format(
                    table_name, up_col.field, None))

            with Indent(printer):
                differs = (up_col.after != down_col.after
                           or up_col.default != down_col.default
                           or up_col.null != down_col.null
                           or up_col.type != down_col.type)
                if not differs:
                    printer("Column: {} has no schema changes".format(up_col.field))
                    continue

                printer("Modifying column: {}".format(up_col.field))
                printer("Old:\n {}".format(down_col))
                printer("New:\n {}".format(up_col))

                modify = _column_definition(create_table_sql, up_col.field)
                modify_query = "ALTER TABLE {} MODIFY COLUMN {} ".format(table_name, modify)
                if up_col.after:
                    modify_query += "AFTER {}".format(up_col.after)
                else:
                    modify_query += "FIRST"
                modify_query += ";"
                show_do_query(downstream_cursor, modify_query)

                # The downstream column changes FROM its old definition TO
                # upstream's; the two entries used to be swapped.
                column_report = OrderedDict()
                column_report["column"] = up_col.field
                column_report["from"] = down_col.__dict__
                column_report["to"] = up_col.__dict__
                report.modified.append(column_report)

    return report
def general(table, zoom_levels, db_pair, cli_args, id_col='id', batch_rows=Constants.batch_rows, condition=None, printer=Prindenter()):
    """Sync a table by recursively narrowing down the id ranges that differ.

    The function calls itself in three phases:

    1. When `table` is a name (str), it is turned into a table object via
       `pre_general`. If that fails with a column-count mismatch, the
       downstream table is dropped and recreated from the upstream schema
       and `pre_general` is retried.
    2. When `zoom_levels` is a list of granularities, it is turned into a
       SortedDict mapping granularity -> scopes (None = not yet scanned),
       seeded with one outermost interval covering 0..upstream max id.
    3. Each recursion fills in the largest still-empty zoom level by
       fingerprinting the next-larger level's scopes at that granularity.
       Once every level is populated, the finest level's rows / row-ranges
       are dumped from upstream, deleted downstream and reloaded.

    Args:
        table: table name (str) on the first call, table object afterwards.
        zoom_levels: list of granularities on the first call, zoom-level map
            afterwards.
        db_pair: provides `upstream.connection`, `downstream.args` and
            `reup_maxes`.
        cli_args: provides the `upstream` / `downstream` arguments handed to
            the mysqldump / mysqlload helpers.
        id_col: id column name, forwarded to `pre_general`.
        batch_rows: forwarded to `pre_general`.
        condition: optional SQL predicate forwarded to `pre_general` and
            `Db.find_diffs`.
        printer: callable used for progress output and summary lines.

    Returns:
        None.

    Raises:
        ValueError: if the finest zoom level holds neither ints with size
            <= 1 nor Ids.Interval objects with size > 1.
    """
    # prepare for recursion if not already in it
    if isinstance(table, str):
        printer("[Examining table: {}]".format(table))
        with Indent(printer):
            try:
                table = pre_general(table, db_pair, cli_args, id_col, batch_rows,
                                    condition=condition, printer=printer)
            except sh.ErrorReturnCode_1 as err:
                # handle schema mismatches with a sledgehammer
                # TODO: allow user to provide path to migration scripts,
                # run outstanding ones if they show up in migration_tracker
                if "Column count doesn't match" not in str(err):
                    raise

                printer("Upstream schema differs, pulling it down")
                with Indent(printer):
                    # get upstream schema
                    filename = 'newschema_{}.sql'.format(table)
                    mysqldump_schema_nofk(cli_args.upstream, filename,
                                          restrict_to_table=table, printer=printer)

                    # drop downstream table
                    drop = 'drop table {};'.format(table)
                    with Connection(db_pair.downstream.args) as downstream_connection:
                        with downstream_connection.cursor() as downstream_cursor:
                            show_do_query(downstream_cursor, drop, printer=printer)

                    # recreate downstream table
                    mysqlload(cli_args.downstream, filename, printer=printer)

                    # try again; pass batch_rows as the first attempt does
                    # (the retry used to drop the caller's value)
                    printer("[New schema loaded, downstream table is empty]")
                    table = pre_general(table, db_pair, cli_args, id_col, batch_rows,
                                        condition=condition, printer=printer)

    if isinstance(zoom_levels, list):
        # set up for recursion
        if not table.needs_work:
            printer("Sync: 'general' finished early: presync was sufficient")
            return

        printer(
            "Sync: 'general' received magnification list instead of zoom_level map, building zoom_level map...",
            end='')
        with Indent(printer):
            # prepare the zoom-level map
            zoom_levels = SortedDict({x: None for x in zoom_levels})

            # append the outermost zoom level (completed in general)
            zoom_levels[table.upstream.max_id] = [
                Ids.Interval(0, table.upstream.max_id)
            ]
        printer("done\n")

        # begin recursion
        printer("[Sync: 'general' top-level recursion]")
        with Indent(printer):
            return general(table, zoom_levels, db_pair, cli_args,
                           condition=condition, printer=printer)

    # if control gets this far, recursion has begun
    granularity = None
    scopes = None

    # examine the scope map by decreasing magnification
    # find the transition from unknowns to knowns
    for ((smaller_granularity, smaller_scope), (larger_granularity, larger_scope)) \
            in reversed(list(zip(zoom_levels.items(), zoom_levels.items()[1:]))):
        if not smaller_scope:
            scopes = larger_scope              # we'll be filling these out
            granularity = smaller_granularity  # by breaking them into pieces this big
            break

    if not scopes:
        printer("Zoom-level map fully populated, no more 'general' recursions will follow")

        conditions = []
        final_size = zoom_levels.keys()[0]
        final_scopes = list(zoom_levels.values()[0])
        final_scopes.sort()

        if final_size <= 1 and isinstance(final_scopes[0], int):
            printer("Scanned down to individual rows")
            row_lists = Ids.partition(Constants.batch_fingerprints, final_scopes)
            for rows in row_lists:
                conditions.append("{} in ({})".format(
                    table.id_col, ",".join([str(x) for x in rows])))

        elif final_size > 1 and isinstance(final_scopes[0], Ids.Interval):
            printer("Scanned down to row-ranges of size {}".format(final_size))
            interval_lists = Ids.partition(Constants.batch_fingerprints, final_scopes)
            for intervals in interval_lists:
                conditions.append(" OR ".join([
                    "{} BETWEEN {} AND {}".format(table.id_col, i.start, i.end)
                    for i in intervals
                ]))
        else:
            raise ValueError("Can't decide whether to transfer rows, or row-ranges")

        printer("[Transfer proceeding in {} batches]".format(len(conditions)))

        with Indent(printer):
            # named so the loop does not overwrite the `condition` parameter
            for batch_condition in conditions:
                # dump upstream data
                mysqldump_data(cli_args.upstream, table.name, batch_condition, printer=printer)

                # clear old rows from downstream
                delete = 'delete from {} where {};'.format(table.name, batch_condition)
                with Connection(db_pair.downstream.args) as downstream_connection:
                    with downstream_connection.cursor() as cursor:
                        show_do_query(cursor, delete, printer=printer)

                # load new rows into downstream
                mysqlload(cli_args.downstream, table.name, printer=printer)

        with Connection(db_pair.downstream.args) as downstream_connection:
            with downstream_connection.cursor() as downstream_cursor:
                with db_pair.upstream.connection.cursor() as upstream_cursor:
                    table.is_synced_warn(upstream_cursor, downstream_cursor,
                                         message='(after general sync)', printer=printer)
                    table.try_sync_schema(upstream_cursor, downstream_cursor,
                                          throw=True, printer=printer)

    # if we found a row with unpopulated scopes, then we have more scanning to do
    else:
        printer(
            "[Given {} larger-granules, making smaller granules of size {} and fingerprinting them]"
            .format(len(scopes), granularity))
        next_scopes = []

        with Indent(printer):
            with Connection(db_pair.downstream.args) as downstream_connection:
                with downstream_connection.cursor() as downstream_cursor:
                    with db_pair.upstream.connection.cursor() as upstream_cursor:

                        # new sessions, reset group_concat (default is oddly low)
                        db_pair.reup_maxes(downstream_cursor, upstream_cursor, printer=printer)

                        # rather than making a round trip for each scope, do them all at once
                        next_scopes += list(
                            Db.find_diffs(upstream_cursor, downstream_cursor, table,
                                          scopes, granularity,
                                          condition=condition, printer=printer))
                        printer('')  # Db.find_diffs ends without a newline... add one

        # Test emptiness with len(), not any(): a result of [0] is a real
        # diff (row 0), yet any([0]) is False and would discard it.
        if len(next_scopes) == 0:
            message = textwrap.dedent("""
                Found no ranges with diffs. Nothing to do.
                If the tables were truly identical, TABLE CHECKSUM would have prevented sync from gettin this far.
                Perhaps some columns were ignored during the scan?
                (e.g. timestamps, as an ugly hack to avoid thinking about time zones)
                """)
            printer(message)
            printer.append_summary(
                "{} : IDENTICAL? (TABLE CHECKSUM failed but a custom MD5 scan found no diffs)"
                .format(table.name))

        # some ranges contain diffs: record them and zoom in further
        else:
            zoom_levels[granularity] = next_scopes
            printer("[Another 'general' recursion]")
            with Indent(printer):
                return general(table, zoom_levels, db_pair, cli_args,
                               condition=condition, printer=printer)
def set_group_concat(cursor, value, printer=Prindenter()):
    """Set the session's `group_concat_max_len` to `value`.

    Args:
        cursor: cursor whose session variable is set.
        value: new value, interpolated directly into the statement.
        printer: callable used for progress output.

    Returns:
        Whatever `show_do_query` returns for the SET statement.
    """
    statement = f"set session group_concat_max_len = {value};"
    return show_do_query(cursor, statement, printer=printer)
def group_sync(id_col, check_col, upstream, downstream, made_changes, printer=Prindenter()):
    """Re-pull the groups whose `check_col` differs between the two results.

    Groups present only downstream are deleted, groups present only upstream
    are pulled, and groups whose `check_col` values differ are deleted and
    pulled again. Upstream rows are dumped first, then downstream rows are
    deleted, then the dump is loaded.

    Reads `table`, `cli_args` and `db_pair` from the surrounding scope (they
    are not parameters of this function).

    Args:
        id_col: key that identifies a group in each result row.
        check_col: key of the value compared between the two sides.
        upstream: query result rows from upstream.
        downstream: query result rows from downstream.
        made_changes: the caller's running "anything changed" flag.
        printer: callable used for progress output.

    Returns:
        True if `made_changes` was already true or this call dumped, deleted
        or loaded anything; False otherwise. Assigning to the parameter alone
        never reached the caller, so the flag is now returned.
    """
    group_fingerprints_by_id = {}

    def populate(name, query_result, key, store):
        for row in query_result:
            try:
                store.setdefault(row[key], {})
                store[row[key]][name] = row[check_col]
            except KeyError:
                # NOTE(review): debugging hook kept from the original; it
                # opens an interactive shell when a row lacks the expected
                # key. Consider raising instead.
                IPython.embed()

    populate('up', upstream, id_col, group_fingerprints_by_id)
    populate('down', downstream, id_col, group_fingerprints_by_id)

    to_delete = []
    to_write = []
    for group_id, stream in group_fingerprints_by_id.items():
        if 'up' not in stream:
            # only downstream has it
            to_delete.append(group_id)
        elif 'down' not in stream:
            # only upstream has it
            to_write.append(group_id)
        elif stream['up'] != stream['down']:
            # both have it, but they disagree: replace it
            to_delete.append(group_id)
            to_write.append(group_id)

    # Test the lists themselves rather than any(...): any([0]) is False,
    # which silently skipped a group whose key is 0.
    write_condition = f"{id_col} in ({','.join(map(str, to_write))})" if to_write else None
    delete_condition = f"{id_col} in ({','.join(map(str, to_delete))})" if to_delete else None

    if write_condition:
        printer(f"Found {len(to_write)} groups to pull down from upstream")
        made_changes = True
        mysqldump_data(cli_args.upstream, table.name, write_condition, printer=printer)
    else:
        printer("Nothing to pull down from upstream")

    if delete_condition:
        printer("Making space downstream")
        made_changes = True
        with Indent(printer):
            with Connection(db_pair.downstream.args) as downstream_connection:
                with downstream_connection.cursor() as cursor:
                    show_do_query(
                        cursor,
                        f'delete from {table.name} where {delete_condition};',
                        printer=printer)
    else:
        printer("Downstream space is open for new data")

    if write_condition:
        # load from a file
        printer("Loading rows")
        mysqlload(cli_args.downstream, table.name, printer=printer)

    return bool(made_changes)