def pull_schema(cli_args, local_args, remote_connection, printer=Prindenter()):

    target_db = local_args.database

    with LocalConnection(local_args) as local_connection:
        with local_connection.cursor() as cursor:

            show_tables = 'show tables;'
            result = show_do_query(cursor, show_tables, printer=printer)

            table_ct = len(result)
            if table_ct > 0:
                printer("{} is a nonempty local database. "
                        "If you want me to create a new database in its place, "
                        "you'll have to drop and create it yourself.".format(target_db))
                # if you'd rather I nuke it for you, you're trusting me too much
            else:
                tmp_file = 'schema_nofk.sql'

                # dump schema to a file
                mysqldump_schema_nofk_remote(cli_args, tmp_file, printer=printer)

                # load from a file
                mysqlload_local(cli_args, tmp_file, printer=printer)
def get_steps(cli_args, local_args, remote_connection, printer=Prindenter()):
    return {

        'foo_tokens': lambda: pull_foo(cli_args, local_args, remote_connection, printer=printer),
        # 'foo' gets its own sync function.
        # It pulls only a subset of `foo_ref`, and then pulls only the subset of `foo_tokens`
        # that is referenced by `foo_ref`.
        # It always clobbers the entire target table with the new data.

        'foo_ref': None,  # this table is also handled by pull_foo

        'bar': lambda: general_sync('bar', 15, cli_args, local_args, remote_connection, printer=printer),
        # 'bar' experiences INSERTs, DELETEs, and UPDATEs, so it needs a full sync.
        # general_sync first calls pull_missing_ids, so any newly added rows are present in the target.
        # Then it partitions the table based on id-range, compares MD5 hashes of each partition, and
        # transfers whichever partitions contained changes.
        # Based on its size, we'll sync it in batches of 15:
        # too large means we spend less time finding the changes and more time moving data,
        # too small means we spend more time finding the changes and less time moving data.
        # Consider the relative value of network bandwidth vs cpu time in your own case.

        'baz': lambda: pull_missing_ids('baz', cli_args, local_args, remote_connection, printer=printer),
        # this table only experiences INSERTs, so we can just sync via max(id)
    }
def md5_row_range(cursor, table_name, column_conversions, interval, id_col='id', printer=Prindenter()):

    printer("[Fingerprinting " + cursor.connection.db +
            ".{table_name} across rows where {id_col} in {interval}]".format(**vars()))
    with Indent(printer):

        # concat-friendly conversions for the target table
        converted_columns_str = ",".join(column_conversions)

        # hash the row-range
        start = interval.start
        end = interval.end
        condition = "{id_col} >= {start} AND {id_col} < {end}".format(**vars())

        result = show_do_query(cursor,
                               """
                               SELECT MD5(GROUP_CONCAT(row_fingerprints)) AS range_fingerprint
                               FROM (SELECT MD5(CONCAT({})) AS row_fingerprints
                                     FROM {}
                                     WHERE {}
                                     ORDER BY {}) AS r;
                               """.format(converted_columns_str, table_name, condition, id_col),
                               printer=printer)

        return result[0]['range_fingerprint']
def get_max_id(cursor, table_name, id_col='id', printer=Prindenter()):

    target = 'max({})'.format(id_col)
    get_max_id = 'select {} from {};'.format(target, table_name)
    result = show_do_query(cursor, get_max_id, printer=printer)

    return result[0][target]
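# show_do_query is assumed throughout this module but isn't defined in this section.
# A minimal sketch of the behavior implied by its call sites (print the query, run it,
# return the rows as dicts); it assumes a DictCursor and is not the project's actual code:
def show_do_query(cursor, query, printer=Prindenter()):
    printer(query)               # show the query before running it
    cursor.execute(query)        # do the query
    result = cursor.fetchall()   # rows come back as dicts, keyed by column name
    printer(result)
    return result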
def pull_foo(cli_args, local_args, remote_connection, printer=Prindenter()):

    # grab only the foo token indices that are relevant
    with remote_connection.cursor() as remote_cursor:
        get_ids = '''select id, foo_token_id from foo_ref where name like 'relevant%';'''
        result = show_do_query(remote_cursor, get_ids, printer=printer)

        foo_token_ids = ', '.join([str(x['foo_token_id']) for x in result])
        foo_ref_ids = ', '.join([str(x['id']) for x in result])

    # dump just those rows
    mysqldump_data_remote(cli_args, 'foo_ref', 'id in ({});'.format(foo_ref_ids), printer=printer)
    mysqldump_data_remote(cli_args, 'foo_tokens', 'id in ({});'.format(foo_token_ids), printer=printer)

    # clear old rows
    with LocalConnection(local_args) as local_connection:
        with local_connection.cursor() as cursor:
            show_do_query(cursor, 'truncate foo_ref;', printer=printer)
            show_do_query(cursor, 'truncate foo_tokens;', printer=printer)

    # load new rows
    mysqlload_local(cli_args, 'foo_ref', printer=printer)
    mysqlload_local(cli_args, 'foo_tokens', printer=printer)

    printer("foo_tokens and foo_ref are up to date where it matters")
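# mysqldump_data_remote and mysqlload_local (and, in pull_schema, mysqldump_schema_nofk_remote)
# shell out to the mysqldump/mysql clients but aren't defined in this section.  A rough,
# hypothetical sketch of the first two, assuming cli_args carries the same fields that main()
# reads off argparse (remote_host, remote_user, remote_password, remote_database, local_user,
# local_password, local_database, local_socket); the file naming and flags are guesses, not
# the project's actual implementation:
import subprocess

def mysqldump_data_remote(cli_args, table_name, condition, append=False, printer=Prindenter()):
    # dump only the matching rows (data, no CREATE TABLE) from the remote table into <table>.sql
    cmd = ['mysqldump',
           '-h', cli_args.remote_host,
           '-u', cli_args.remote_user,
           '-p' + cli_args.remote_password,
           '--no-create-info',
           '--where=' + condition.rstrip(';'),
           cli_args.remote_database,
           table_name]
    printer(' '.join(cmd))
    with open(table_name + '.sql', 'a' if append else 'w') as outfile:
        subprocess.run(cmd, stdout=outfile, check=True)

def mysqlload_local(cli_args, name, printer=Prindenter()):
    # replay a dump file into the local database; `name` is a table name or a .sql filename
    filename = name if name.endswith('.sql') else name + '.sql'
    cmd = ['mysql',
           '--socket=' + cli_args.local_socket,
           '-u', cli_args.local_user,
           '-p' + cli_args.local_password,
           cli_args.local_database]
    printer(' '.join(cmd) + ' < ' + filename)
    with open(filename) as infile:
        subprocess.run(cmd, stdin=infile, check=True)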
def warn_if_not_equal(remote_cursor, local_cursor, table_name, printer=Prindenter()):

    if is_equal(remote_cursor, local_cursor, table_name, printer):
        printer("{} is identical on either side".format(table_name))
    else:
        printer("WARNING: {} differs, even after sync!".format(table_name))
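# Prindenter and Indent show up in every signature above but are defined elsewhere.
# A minimal sketch of the assumed behavior: a print-like callable that prefixes output
# with its current indentation, and a context manager that bumps that indentation while
# a block is active.  The names match the calls above; the details are guesses.
class Prindenter:
    def __init__(self, indent=0, step=4):
        self.indent = indent
        self.step = step

    def __call__(self, message):
        # print each line of the message behind the current indentation
        for line in str(message).splitlines() or ['']:
            print(' ' * self.indent + line)

class Indent:
    def __init__(self, printer):
        self.printer = printer

    def __enter__(self):
        self.printer.indent += self.printer.step
        return self.printer

    def __exit__(self, exc_type, exc_value, traceback):
        self.printer.indent -= self.printer.step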
def main(args):

    printer = Prindenter(indent=0)

    printer('Pulling schema from {} to localhost'.format(args.remote_host))

    # defer local connection setup
    local_args = LocalArgs(args.local_user, args.local_password, args.local_database, args.local_socket)

    # set up remote connection
    remote_args = RemoteArgs(args.remote_host, args.remote_user, args.remote_password,
                             args.remote_database, args.cipher)

    with RemoteConnection(remote_args) as remote_connection, Indent(printer):
        pull_schema(args, local_args, remote_connection, printer=printer)

    printer('Done')
def find_diff_intervals(remote_cursor, local_cursor, table_name, intervals, id_col='id', printer=Prindenter()):

    has_diffs = []

    printer("[Examining table formats on either side]")
    with Indent(printer):
        local_columns = examine_columns(local_cursor, table_name, printer=printer)
        remote_columns = examine_columns(remote_cursor, table_name, printer=printer)

    printer("[Scanning intervals for changes]")
    with Indent(printer):
        for interval in intervals:

            printer("[Scanning {}]".format(interval))
            with Indent(printer):
                local_fingerprint = md5_row_range(local_cursor, table_name, local_columns, interval,
                                                  id_col=id_col, printer=printer)
                remote_fingerprint = md5_row_range(remote_cursor, table_name, remote_columns, interval,
                                                   id_col=id_col, printer=printer)

            if local_fingerprint == remote_fingerprint:
                printer("{} NO SYNC NEEDED\n".format(interval))
            else:
                printer("{} NEEDS SYNC\n".format(interval))
                has_diffs.append(interval)

    return has_diffs
def is_equal(remote_cursor, local_cursor, table_name, printer=Prindenter()):

    printer("[Checking table equality for {}]".format(table_name))
    with Indent(printer):

        get_checksum = 'checksum table {};'.format(table_name)

        result = show_do_query(remote_cursor, get_checksum, printer=printer)
        remote_checksum = result[0]['Checksum']

        result = show_do_query(local_cursor, get_checksum, printer=printer)
        local_checksum = result[0]['Checksum']

        # callers decide what to say about the verdict (see warn_if_not_equal)
        return remote_checksum == local_checksum
def main(args):

    printer = Prindenter(indent=0)

    printer('Syncing {} to localhost'.format(args.remote_host))

    # defer local connection setup
    local_args = LocalArgs(args.local_user, args.local_password, args.local_database, args.local_socket)

    # set up remote connection
    if hasattr(args, 'cipher'):
        remote_args = RemoteArgs(args.remote_host, args.remote_user, args.remote_password,
                                 args.remote_database, cipher=args.cipher)
    else:
        remote_args = RemoteArgs(args.remote_host, args.remote_user, args.remote_password,
                                 args.remote_database)

    printer('connecting with:')
    printer(remote_args.__dict__)

    with RemoteConnection(remote_args) as remote_connection, Indent(printer):

        # do the sync-steps for each table in the slice
        for table_name, sync_func in get_steps(args, local_args, remote_connection, printer).items():

            printer('[Table: {}]'.format(table_name))
            with Indent(printer):
                if sync_func:
                    sync_func()
                    printer("")
                else:
                    with Indent(printer):
                        printer("skipped explicitly by slice definition")
                        printer("")

    printer('Done')
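# LocalArgs, RemoteArgs, LocalConnection, and RemoteConnection are used by both mains but
# defined elsewhere.  One plausible shape, sketched with dataclasses and pymysql -- the
# field names come from how they are used above; the ssl handling, cursor class, and
# defaults are assumptions, not the project's actual implementation:
from dataclasses import dataclass
import pymysql
import pymysql.cursors

@dataclass
class LocalArgs:
    user: str
    password: str
    database: str
    socket: str

@dataclass
class RemoteArgs:
    host: str
    user: str
    password: str
    database: str
    cipher: str = None

class RemoteConnection:
    # context manager yielding a DictCursor-flavored pymysql connection to the remote host
    def __init__(self, args):
        self.args = args

    def __enter__(self):
        self.connection = pymysql.connect(host=self.args.host,
                                          user=self.args.user,
                                          password=self.args.password,
                                          database=self.args.database,
                                          ssl={'cipher': self.args.cipher} if self.args.cipher else None,
                                          cursorclass=pymysql.cursors.DictCursor)
        return self.connection

    def __exit__(self, exc_type, exc_value, traceback):
        self.connection.close()

class LocalConnection(RemoteConnection):
    # same idea, but connects over the local unix socket
    def __enter__(self):
        self.connection = pymysql.connect(unix_socket=self.args.socket,
                                          user=self.args.user,
                                          password=self.args.password,
                                          database=self.args.database,
                                          cursorclass=pymysql.cursors.DictCursor)
        return self.connection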
def get_max_md5_rows(cursor, try_set=1000000 * 33, printer=Prindenter()):
    # 32 bytes for the md5, plus 1 for the comma, times a million rows.
    # Limiting it here because I'd prefer too many small queries over a few monsters
    # that tie up the server with no gaps.  This may be unnecessarily cautious;
    # go bigger at your own risk.  Hell, this is all at your own risk.

    printer("[How many rows is {} willing to hash at a time?]".format(cursor.connection.host))
    with Indent(printer):

        # try to ask for enough space for 1 million rows at a time
        printer("Asking for lots of space...")
        result = show_do_query(cursor,
                               "set session group_concat_max_len = {};".format(try_set),
                               printer=printer)

        # but accept what we're given
        printer("Taking what we can get...")
        result = show_do_query(cursor,
                               "show variables where Variable_name = 'group_concat_max_len';",
                               printer=printer)
        max_group_concat_bytes = int(result[0]['Value'])

        # and see how many rows that is
        printer("How many of these will fit?")
        result = show_do_query(cursor,
                               "select length(concat(md5('foo'),',')) as md5_bytes;",
                               printer=printer)
        md5_bytes = int(result[0]['md5_bytes'])

        rows = floor(max_group_concat_bytes / md5_bytes)
        printer("{} rows".format(rows))
        return rows
def examine_columns(cursor, table_name, printer=Prindenter()):

    printer("[Examining Columns on {}.{}]".format(cursor.connection.db, table_name))
    with Indent(printer):

        result = show_do_query(cursor,
                               """
                               SELECT COLUMN_NAME, IS_NULLABLE, COLUMN_TYPE, COLLATION_NAME
                               FROM information_schema.columns
                               WHERE table_schema='{}' AND table_name='{}';
                               """.format(cursor.connection.db, table_name),
                               printer=printer)

        column_conversions = []
        for column in result:

            # make the column representation concatenate-friendly
            converted = column['COLUMN_NAME']

            if column['IS_NULLABLE'] == 'YES':
                converted = "IFNULL({}, 'NULL')".format(converted)

            if column['COLLATION_NAME'] and column['COLLATION_NAME'] not in ['NULL', 'utf8_general_ci']:
                converted = "BINARY {}".format(converted)

            # your data may deviate in new and exciting ways
            # handle them here ...

            with Indent(printer):
                printer(converted)

            column_conversions.append(converted)

        return column_conversions
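# For illustration only: given a hypothetical table with a non-nullable `id` column and a
# nullable `name` column collated as utf8mb4_unicode_ci, the conversions built above would
# come out roughly as
#
#   ['id', "BINARY IFNULL(name, 'NULL')"]
#
# which md5_row_range then joins into CONCAT(id,BINARY IFNULL(name, 'NULL')) for hashing.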
def general_sync(table_name, interval_size, cli_args, local_args, remote_connection, printer=Prindenter()):

    # check to see if work needs to be done
    with LocalConnection(local_args) as local_connection:
        with local_connection.cursor() as local_cursor:
            with remote_connection.cursor() as remote_cursor:

                if is_equal(remote_cursor, local_cursor, table_name, printer):
                    printer("{} is identical on either side".format(table_name))
                    return

                printer("[Syncing new rows for table {}]".format(table_name))
                with Indent(printer):
                    # get any new rows
                    max_id = pull_missing_ids(table_name, cli_args, local_args, remote_connection,
                                              printer=printer)

                if is_equal(remote_cursor, local_cursor, table_name, printer):
                    printer("{} is identical on either side".format(table_name))
                    return

                printer("[Scanning for diffs in table {}]".format(table_name))
                with Indent(printer):
                    # find which id-ranges have changes that need syncing
                    remote_max = get_max_md5_rows(remote_cursor, printer=printer)
                    local_max = get_max_md5_rows(local_cursor, printer=printer)
                    interval_size = min(remote_max, local_max, interval_size)

                    intervals = make_intervals(0, max_id, interval_size)
                    diff_intervals = find_diff_intervals(remote_cursor, local_cursor, table_name,
                                                         intervals, printer=printer)

    printer("[Transferring for diffs in table {}]".format(table_name))
    with Indent(printer):
        for interval in diff_intervals:

            condition = 'id >= {} and id <= {}'.format(interval.start, interval.end)

            # dump remote data
            mysqldump_data_remote(cli_args, table_name, condition, printer=printer)

            # clear old rows from local
            delete = 'delete from {} where {};'.format(table_name, condition)
            with LocalConnection(local_args) as local_connection:
                with local_connection.cursor() as cursor:
                    show_do_query(cursor, delete, printer=printer)

            # load new rows into local
            mysqlload_local(cli_args, table_name, printer=printer)

    # warn if not equal
    with LocalConnection(local_args) as local_connection:
        with local_connection.cursor() as local_cursor:
            with remote_connection.cursor() as remote_cursor:
                warn_if_not_equal(remote_cursor, local_cursor, table_name, printer=printer)
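# make_intervals is used above but defined elsewhere.  A plausible sketch, assuming
# half-open [start, end) ranges to match the `id >= start AND id < end` condition that
# md5_row_range applies -- the real helper may differ:
from collections import namedtuple

Interval = namedtuple('Interval', ['start', 'end'])

def make_intervals(begin, max_id, interval_size):
    # cover ids [begin, max_id] with consecutive half-open intervals of interval_size
    intervals = []
    start = begin
    while start <= max_id:
        intervals.append(Interval(start, start + interval_size))
        start += interval_size
    return intervals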
def pull_missing_ids(table_name, cli_args, local_args, remote_connection, printer=Prindenter()):

    target = 'max(id)'
    get_max_id = 'select {} from {};'.format(target, table_name)

    # main() doesn't pass in a local connection because if the connection is held open
    # then mysqlload_local (below) can't make its own separate connection
    with LocalConnection(local_args) as local_connection:
        with local_connection.cursor() as cursor:
            printer("Finding max(id) for {}.{}".format(table_name, cursor.connection.db))
            with Indent(printer):
                result = show_do_query(cursor, get_max_id, printer=printer)
                begin = result[0][target] or 0

    with remote_connection.cursor() as cursor:
        printer("Finding max(id) for {}.{}".format(table_name, cursor.connection.db))
        with Indent(printer):
            result = show_do_query(cursor, get_max_id, printer=printer)
            end = result[0][target] or 0

    if end == begin:
        printer("Nothing to sync")

    # check for local changes beyond the remote db's max(id) and clobber them (this is a one-way sync)
    elif end < begin:
        with LocalConnection(local_args) as local_connection:
            with local_connection.cursor() as cursor:
                printer("Downstream db has more rows, deleting them.")
                delete = 'delete from {} where id > {};'.format(table_name, end)
                with Indent(printer):
                    result = show_do_query(cursor, delete, printer=printer)

    else:
        printer("Upstream db has more rows, pulling them.")

        # dump to a file
        if begin == 0:
            # if the target table is empty, dump everything
            mysqldump_data_remote_batches(cli_args, table_name, batch_rows, end,
                                          min_id=begin, printer=printer)
        else:
            # otherwise, dump just the rows that aren't in the target
            mysqldump_data_remote_batches(cli_args, table_name,
                                          batch_rows,  # batch size
                                          end,         # max id
                                          min_id=begin,
                                          condition='id > {}'.format(begin),
                                          printer=printer)

        # load from a file
        mysqlload_local(cli_args, table_name, printer=printer)

        with LocalConnection(local_args) as local_connection:
            with local_connection.cursor() as local_cursor:
                with remote_connection.cursor() as remote_cursor:
                    warn_if_not_equal(remote_cursor, local_cursor, table_name, printer=printer)

    # return max id
    return end
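# batch_rows and mysqldump_data_remote_batches are referenced above but defined elsewhere.
# batch_rows is presumably a module-level batch size; the helper presumably dumps the id
# range one slice at a time so no single mysqldump has to stream the whole backlog.  A guess
# at that behavior, reusing the hypothetical mysqldump_data_remote sketch from earlier (and
# its append flag) so the batches accumulate into one file for mysqlload_local to replay:
batch_rows = 100000  # hypothetical default

def mysqldump_data_remote_batches(cli_args, table_name, batch_rows, max_id,
                                  min_id=0, condition=None, printer=Prindenter()):
    start = min_id
    while start <= max_id:
        batch_condition = 'id >= {} and id < {}'.format(start, start + batch_rows)
        if condition:
            batch_condition += ' and ' + condition
        mysqldump_data_remote(cli_args, table_name, batch_condition,
                              append=(start != min_id), printer=printer)
        start += batch_rows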