def test_run(self):
    self.mox.StubOutWithMock(exporter_mysql, 'time')
    exporter_mysql.time.sleep(mox.IgnoreArg())
    exporter_mysql.time.time().AndReturn(42)
    self.chunk.update()
    self.mox.StubOutWithMock(db, 'shard_connection')
    conn = self.mox.CreateMockAnything()
    db.shard_connection('src', read=True).AndReturn(self.context(conn))
    self.mox.StubOutWithMock(subprocess, 'Popen')
    proc = self.mox.CreateMockAnything()
    subprocess.Popen(mox.IgnoreArg(), shell=True, stdin=subprocess.PIPE).AndReturn(proc)
    proc.stdin = self.mox.CreateMockAnything()
    proc.stdin.close()
    proc.wait()
    proc.returncode = 0
    self.mox.StubOutWithMock(db, 'cursor')
    cur = self.mox.CreateMockAnything()
    db.cursor(conn).AndReturn(self.context(cur))
    cur.execute(mox.IgnoreArg(), mox.IgnoreArg())
    cur.rowcount = 1
    self.mox.StubOutWithMock(exporter_mysql.streamer, 'queue_stream_chunk')
    exporter_mysql.streamer.queue_stream_chunk(self.config)
    self.chunk.update()
    self.chunk.update_status('exported', where_status='exporting')
    proc = self.mox.CreateMockAnything()
    subprocess.Popen(mox.IgnoreArg(), shell=True, stdin=subprocess.PIPE).AndReturn(proc)
    proc.stdin = self.mox.CreateMockAnything()
    proc.stdin.close()
    proc.wait()
    self.mox.ReplayAll()
    self.worker._run()
def main():
    print('Generating data')
    with db.shard_connection('mysql', read=False) as conn:
        for host_id in xrange(1, 11):
            print('.')
            name = str(uuid.uuid4())
            time = datetime.now() - timedelta(days=365)
            with db.cursor(conn) as cur:
                cur.execute(
                    'INSERT INTO shinkansen.host (id, name, created) VALUES (%s, %s, %s)',
                    (host_id, name, time)
                )
            while time < datetime.now():
                status = 'up' if random.random() < 0.9 else 'down'
                with db.cursor(conn) as cur:
                    cur.execute(
                        'INSERT INTO shinkansen.host_checkin (host_id, time, status) VALUES (%s, %s, %s)',
                        (host_id, time, status)
                    )
                if status == 'up':
                    for metric in ['cpu', 'mem', 'disk']:
                        with db.cursor(conn) as cur:
                            cur.execute(
                                'INSERT INTO shinkansen.host_metric (host_id, name, time, value) '
                                'VALUES (%s, %s, %s, %s)',
                                (host_id, metric, time, random.random())
                            )
                time += timedelta(days=1)
        conn.commit()
    print('done')
def test_get_count(self):
    cur = self.mox.CreateMockAnything()
    self.mox.StubOutWithMock(db, 'cursor')
    db.cursor(None).AndReturn(self.context(cur))
    cur.execute(mox.IgnoreArg())
    cur.fetchone().AndReturn((42,))
    self.mox.ReplayAll()
    self.assertEqual(42, shinkansen_trim_crate_data.get_count(None, ''))
def get_column_metadata(self, conn):
    columns = []
    with db.cursor(conn, dictionary=True) as cur:
        cur.execute(
            'SELECT * FROM information_schema.columns WHERE schema_name = %(?)s AND table_name = %(?)s ' % {
                '?': conn.PARAMETER_PLACEHOLDER
            },
            (self.c.source_schema.lower(), self.c.table_config.table_name.lower())
        )
        column_recs = cur.fetchall()
    pk_cols = set()
    with db.cursor(conn) as cur:
        cur.execute(
            'SELECT constraint_name FROM information_schema.table_constraints '
            'WHERE constraint_type = %(?)s AND schema_name = %(?)s AND table_name = %(?)s' % {
                '?': conn.PARAMETER_PLACEHOLDER
            },
            ('PRIMARY_KEY', self.c.source_schema.lower(), self.c.table_config.table_name.lower())
        )
        for (constraint_name,) in cur.fetchall():
            # constraint_name is a list of the columns in the key
            for column in constraint_name:
                pk_cols.add(column.lower())
    for column in column_recs:
        col = db.Column(
            column['column_name'],
            self.TYPE_MAP[column['data_type']],
            column['column_name'].lower() in pk_cols,
            ignore=(column['column_name'].lower() in self.c.table_config.ignore_columns),
        )
        columns.append(col)
        self.column_map[col.lname] = col
    if (
        self.c.migration_type == orm.MigrationType.DELTA
        or self.c.chunk_migration_type == orm.ChunkMigrationType.DIRECT
    ):
        # Check the destination for the primary key columns as well since the schemas may be different
        with db.shard_connection(self.c.destination_shard, read=True) as conn:
            primary_key = conn.get_table_primary_key_columns(
                self.c.destination_schema, self.c.table_config.table_name
            )
            for col_name in primary_key:
                if col_name.lower() not in self.column_map:
                    raise UnrecoverableError(
                        'Primary key column in destination does not exist in source '
                        'table=%s column=%s source=%s destination=%s' % (
                            self.c.table_config.table_name, col_name,
                            self.c.source_shard, self.c.destination_shard))
                self.column_map[col_name.lower()].is_primary_key = True
    self.c.columns = columns
def upsert(self, conn, records, _recursed=False):
    with db.cursor(conn) as cur:
        sql = (
            'INSERT INTO %(schema)s.%(table)s (%(columns)s) VALUES (%(placeholders)s) '
            'ON DUPLICATE KEY UPDATE %(sets)s'
        ) % {
            'schema': self.c.destination_schema,
            'table': self.c.table_config.table_name,
            'columns': ', '.join(col.name for col in self.c.export_columns),
            'placeholders': ', '.join(
                conn.column_insert_sql(col) for col in self.c.export_columns
            ) % {'?': conn.PARAMETER_PLACEHOLDER},
            'sets': ', '.join(
                '%s = VALUES(%s)' % (col.name, col.name)
                for col in self.c.export_columns if not col.is_primary_key
            ),
        }
        try:
            results = cur.executemany(sql, records)
        except Exception, e:
            if config.IGNORE_CONSTRAINT_FAILURES and 'foreign key constraint fails' in str(e) and not _recursed:
                self.log('Foreign key constraint error, trying one by one')
                for record in records:
                    try:
                        self.upsert(conn, [record], _recursed=True)
                    except Exception, e:
                        self.log('Record failed insert due to %r: %r', e, record)
                return
            else:
                raise
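# The following is an illustrative sketch, not part of the original code: it shows roughly what
# the statement built in upsert() renders to. The table and column names are hypothetical, and it
# assumes column_insert_sql() yields a bare %(?)s placeholder and PARAMETER_PLACEHOLDER is '%s'.
def _example_upsert_sql():
    # Hypothetical destination table app.user with primary key `id` and one data column `name`.
    # executemany() with this statement inserts new rows and updates existing rows in place by
    # primary key, which is what makes re-running a chunk safe.
    return (
        'INSERT INTO app.user (id, name) VALUES (%s, %s) '
        'ON DUPLICATE KEY UPDATE name = VALUES(name)'
    )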
def _import_to_crate(self):
    with db.shard_connection(self.c.destination_shard, read=True) as conn:
        self.log('Starting import')
        with db.cursor(conn) as cur:
            sql = "COPY %s.%s FROM '%s'" % (
                self.c.destination_schema, self.c.table_config.table_name, self.c.import_filename)
            cur.execute(sql)
            self.c.num_records_imported = cur.rowcount
        conn.commit()
def test_get_table_metadata_full(self):
    self.mox.StubOutWithMock(queuer.db, 'shard_connection')
    conn = self.mox.CreateMockAnything()
    queuer.db.shard_connection('src', read=True).AndReturn(self.context(conn))
    self.mox.StubOutWithMock(queuer.db, 'cursor')
    cur = self.mox.CreateMockAnything()
    conn.get_current_timestamp().AndReturn(1)
    self.tt.update()
    self.mox.StubOutWithMock(queuer.QueueChunksWorker, 'get_column_metadata')
    queuer.QueueChunksWorker.get_column_metadata(conn)
    db.cursor(conn).AndReturn(self.context(cur))
    cur.execute(mox.IgnoreArg(), mox.IgnoreArg())
    cur.fetchone().AndReturn((1, 2, 3))
    self.mox.ReplayAll()
    self.config.migration_type = orm.MigrationType.FULL
    worker = queuer.QueueChunksWorker(self.config, self.redis)
    worker.get_table_metadata()
def main():
    with db.shard_connection('crate', read=True) as conn:
        print('Creating shinkansen.host')
        with db.cursor(conn) as cur:
            cur.execute('DROP TABLE IF EXISTS shinkansen.host')
        with db.cursor(conn) as cur:
            cur.execute('''
                CREATE TABLE shinkansen.host (
                    id INT PRIMARY KEY,
                    name String,
                    created TIMESTAMP
                )''')
        print('Creating shinkansen.host_checkin')
        with db.cursor(conn) as cur:
            cur.execute('DROP TABLE IF EXISTS shinkansen.host_checkin')
        with db.cursor(conn) as cur:
            cur.execute('''CREATE TABLE shinkansen.host_checkin (
                host_id INT,
                time TIMESTAMP,
                status String
            )''')
        print('Creating shinkansen.host_metric')
        with db.cursor(conn) as cur:
            cur.execute('DROP TABLE IF EXISTS shinkansen.host_metric')
        with db.cursor(conn) as cur:
            cur.execute('''CREATE TABLE shinkansen.host_metric (
                host_id INT,
                name String,
                time TIMESTAMP,
                value String
            )''')
def test_get_table_metadata_delta(self):
    self.config.delta_start = 12345
    self.mox.StubOutWithMock(queuer.db, 'shard_connection')
    conn = self.mox.CreateMockAnything()
    queuer.db.shard_connection('src', read=True).AndReturn(self.context(conn))
    self.mox.StubOutWithMock(queuer.db, 'cursor')
    cur = self.mox.CreateMockAnything()
    conn.get_current_timestamp().AndReturn(1)
    self.tt.update()
    self.mox.StubOutWithMock(queuer.QueueChunksWorker, 'get_column_metadata')
    queuer.QueueChunksWorker.get_column_metadata(conn)
    conn.from_unixtime_value(12345).AndReturn('2042T')
    db.cursor(conn).AndReturn(self.context(cur))
    cur.execute(mox.IgnoreArg(), mox.IgnoreArg())
    cur.fetchone().AndReturn((1, 2, 3))
    self.mox.ReplayAll()
    self.config.migration_type = orm.MigrationType.DELTA
    worker = queuer.QueueChunksWorker(self.config, self.redis)
    worker.column_map = {'a': {}, 'b': {}, 'c': {}}
    worker.get_table_metadata()
def trim_data(conn, sql, trim_time_str, shard_config, table_name, delete, partition_val):
    num_rows = get_count(conn, sql)
    if delete:
        print('Deleting %r from %s.%s older than %s%s' % (
            num_rows, shard_config['default_schema_name'], table_name, trim_time_str,
            '' if partition_val is None else ' for partition %s' % (partition_val,)
        ))
        with db.cursor(conn) as cur:
            cur.execute('DELETE ' + sql)
    else:
        print('%r records to delete from %s.%s older than %s%s' % (
            num_rows, shard_config['default_schema_name'], table_name, trim_time_str,
            '' if partition_val is None else ' for partition %s' % (partition_val,)
        ))
def get_table_metadata(self):
    with db.shard_connection(self.c.source_shard, read=True) as conn:
        self.table.source_start_time = conn.get_current_timestamp()
        self.table.start_time = int(time.time() * 1000)
        self.table.status = 'in_progress'
        self.table.update()
        self.get_column_metadata(conn)
        self.c.where_clauses, self.c.where_values = worker.generate_where(conn, self.c, self.c.table_config)
        if (
            self.c.migration_type == orm.MigrationType.DELTA
            and self.c.table_config.join
        ):
            # TODO: come up with a different way to do deltas for crate with a join clause.
            # We don't need the chunking if we do it via json export.
            if self.c.source_type == 'crate':
                self.log_error(
                    'The %s table specifies a join clause but joins are not supported for crate due to lack of '
                    'aggregation support for JOIN queries. This table will not have any delta migrations '
                    'performed.', self.c.table_config.table_name
                )
                return
            join = self.c.table_config.join % {'schema': self.c.source_schema}
        else:
            join = ''
        sql = (
            'SELECT COUNT(*), MIN(%(chunk_col)s), MAX(%(chunk_col)s) '
            'FROM %(schema)s.%(table)s %(table_alias)s %(join)s %(where)s'
        ) % {
            'chunk_col': self.c.table_config.chunk_col,
            'schema': self.c.source_schema,
            'table': self.c.table_config.table_name,
            'table_alias': self.c.table_config.table_alias,
            # We only need the join clause for delta and direct currently
            'join': join,
            'where': (' WHERE ' + (' AND '.join(self.c.where_clauses))) if self.c.where_clauses else ''
        }
        with db.cursor(conn) as cur:
            cur.execute(sql % {'?': conn.PARAMETER_PLACEHOLDER}, self.c.where_values)
            (self.num_rows, self.min_id, self.max_id) = cur.fetchone()
    self.log('num_rows=%r min_id=%r max_id=%r', self.num_rows, self.min_id, self.max_id)
def _import_to_mysql(self):
    with db.shard_connection(self.c.destination_shard, read=False) as conn:
        self.log('Starting import')
        with db.cursor(conn) as cur:
            sql = (
                (
                    "LOAD DATA INFILE '%(infile)s' "
                    # TODO(jpatrin): If the mysql source had timestamp fields with the special value
                    # '0000-00-00 00:00:00' then the imports will fail if the mysql server is set to be strict
                    # about timestamp types. In this case adding IGNORE below will fix the issue, but may mask
                    # other issues. If this issue recurs we should probably add special support to the migrator
                    # to handle the special '0000-00-00 00:00:00' timestamp value, similar to what is done for
                    # NULL.
                    # "IGNORE "
                    "INTO TABLE %(schema)s.%(table)s CHARACTER SET utf8 "
                    # We use | as an escape character as MySQL's default of \ is problematic and at some times is
                    # not compatible with the csv module's parser.
                    """FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '|' """
                    "LINES TERMINATED BY '\\n' "
                    '(%(columns)s) SET %(sets)s'
                ) % {
                    'infile': self.c.import_filename,
                    'schema': self.c.destination_schema,
                    'table': self.c.table_config.table_name,
                    'columns': ', '.join(
                        ('' if col.ignore else '@') + col.name for col in self.c.export_columns
                    ),
                    'sets': ', '.join(
                        "%s = IF(%s = '%s', NULL, %s)" % (
                            col.name,
                            '@' + col.name,
                            NULL_SENTINEL,
                            conn.column_insert_sql(col) % {'?': '@' + col.name}
                        )
                        for col in self.c.export_columns if not col.ignore
                    ),
                }
            )
            cur.execute(sql)
            self.c.num_records_imported = cur.rowcount
        conn.commit()
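# The following is an illustrative sketch, not part of the migrator: it shows roughly what the
# LOAD DATA statement above renders to for a hypothetical app.user table with two non-ignored
# columns, assuming NULL_SENTINEL is the literal string 'NULL_SENTINEL', the infile path is made
# up, and column_insert_sql() returns the bare user variable.
def _example_load_data_sql():
    # Each column is read into a MySQL user variable first; the SET clause then turns the
    # exporter's NULL sentinel back into a real SQL NULL before the value is stored.
    return (
        "LOAD DATA INFILE '/tmp/chunk.csv' "
        "INTO TABLE app.user CHARACTER SET utf8 "
        "FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ESCAPED BY '|' "
        "LINES TERMINATED BY '\\n' "
        "(@id, @name) "
        "SET id = IF(@id = 'NULL_SENTINEL', NULL, @id), "
        "name = IF(@name = 'NULL_SENTINEL', NULL, @name)"
    )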
def get_column_metadata(self, conn):
    columns = []
    with db.cursor(conn, dictionary=True) as cur:
        cur.execute('DESCRIBE %s.%s' % (self.c.source_schema, self.c.table_config.table_name))
        for column in cur.fetchall():
            alias = '%s.' % (self.c.table_config.table_alias,) if self.c.table_config.table_alias else ''
            base_type = column['Type'].lower().split('(')[0]  # Ignore anything after an opening parenthesis
            col = db.Column(
                column['Field'],
                self.TYPE_MAP.get(base_type, db.ColumnType.STRING),
                column['Key'] == 'PRI',
                ignore=(column['Field'].lower() in self.c.table_config.ignore_columns),
                source_alias=alias
            )
            columns.append(col)
            self.column_map[col.lname] = col
    if (
        self.c.migration_type == orm.MigrationType.DELTA
        or self.c.chunk_migration_type == orm.ChunkMigrationType.DIRECT
    ):
        # Check the destination for the primary key columns as well since the schemas may be different
        with db.shard_connection(self.c.destination_shard, read=True) as conn:
            primary_key = conn.get_table_primary_key_columns(
                self.c.destination_schema, self.c.table_config.table_name
            )
            for col_name in primary_key:
                if col_name.lower() not in self.column_map:
                    raise UnrecoverableError(
                        'Primary key column in destination does not exist in source '
                        'table=%s column=%s source=%s destination=%s' % (
                            self.c.table_config.table_name, col_name,
                            self.c.source_shard, self.c.destination_shard))
                self.column_map[col_name.lower()].is_primary_key = True
    self.c.columns = columns
def check_table(self, table_config, conn):
    table_data = orm.Table.get(
        self.redis_conn, migration_id=self.c.migration_id, partition_val=self.c.partition_val,
        namespace=self.c.namespace, source_shard=self.c.source_shard,
        destination_shard=self.c.destination_shard, table_name=table_config.table_name)
    if table_data is None:
        return None
    with db.cursor(conn) as cur:
        # update the table_config so logging is correct
        self.c.table_config = table_config
        # TODO(jpatrin): Add join support for non-crate destinations
        # TODO(jpatrin): Disabling min and max checks for now as the query is different for crate vs. mysql
        if self.c.migration_type == orm.MigrationType.DELTA:
            if table_config.join:
                self.log_warning('Verification is unsupported for tables in delta migrations with a join clause')
                table_data.verification_status = 'unknown'
                table_data.update()
                return table_data
        elif self.c.migration_type != orm.MigrationType.FULL:
            raise shinkansen.UnrecoverableError('Migration type %r unknown' % (self.c.migration_type,))
        # TODO(jpatrin): The verifier should technically take the join clause into account so it gets the same
        # result as the queuer and exporter, but crate doesn't support joins with aggregation. As long as the
        # destination only has the records we have inserted into it the join shouldn't be needed, though.
        (where_clauses, where_values) = worker.generate_where(conn, self.c, table_config)
        sql = (
            'SELECT COUNT(*) '  # , MIN(%(chunk_col)s), MAX(%(chunk_col)s) '
            'FROM %(schema)s.%(table)s %(table_alias)s %(where)s'  # %(join)s
        ) % {
            # 'chunk_col': chunk_col,
            'schema': self.c.destination_schema,
            'table': table_config.table_name,
            'table_alias': table_config.table_alias,
            # 'join': (self.c.table_config.join % {'schema': self.c.destination_schema}
            #          if self.c.migration_type == orm.MigrationType.DELTA
            #          or self.c.chunk_migration_type == orm.ChunkMigrationType.DIRECT
            #          else ''),
            'where': (' WHERE ' + (' AND '.join(where_clauses))) if where_clauses else ''
        }
        cur.execute(sql % {'?': conn.PARAMETER_PLACEHOLDER}, where_values)
        # (num_rows, min_id, max_id) = cur.fetchone()
        (num_rows,) = cur.fetchone()
    errors = []
    if table_data.num_records != num_rows:
        errors.append('The queued number of rows (%r) and the resulting number of rows (%r) do not match' % (
            table_data.num_records, num_rows))
    # if table_data.min_id != min_id:
    #     errors.append('The queued min_id (%r) and the resulting min_id (%r) do not match' % (
    #         table_data.min_id, min_id))
    # if table_data.max_id != max_id:
    #     errors.append('The queued max_id (%r) and the resulting max_id (%r) do not match' % (
    #         table_data.max_id, max_id))
    if len(errors) > 0:
        self.log_error('Verification errors: %s', ', '.join(errors))
        table_data.verification_status = 'failed'
    else:
        self.log('Verification succeeded')
        table_data.verification_status = 'verified'
    table_data.update()
    return table_data
def _run(self):
    time.sleep(random.randint(0, 10))
    start = datetime.now()
    if self.c.source_type != 'mysql':
        raise UnrecoverableError('This exporter only supports mysql sources, passed in source_type was %r' % (
            self.c.source_type,))
    self.chunk.status = 'exporting'
    self.chunk.start_time = int(time.time() * 1000)
    self.chunk.update()
    source_host = config.SOURCES[self.c.source_shard]['config']['read_host']['host']
    with db.shard_connection(self.c.source_shard, read=True) as mysql_conn:
        self.log('Starting export')
        mysql_host_user = '%s@%s' % (
            config.SSH_USER, source_host)
        cmd = (
            'sudo mkdir -p %(source_dir)s && sudo chmod 777 %(source_dir)s; '
            '[ ! -e %(outfile)s ] || sudo rm %(outfile)s'
        ) % {
            'tmp_dir': config.TMP_DIR,
            'source_dir': worker.SOURCE_DIR,
            'outfile': self.c.export_filename,
        }
        sshcmd = (
            'ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR '
            '-o ControlMaster=no -o ControlPath=none '
            '-p %r -i %s %s "%s"' % (
                config.MYSQL_SSH_PORT, config.SSH_PRIVKEY, mysql_host_user, ssh.escape_double_quotes(cmd)))
        rm_existing = subprocess.Popen(
            sshcmd,
            shell=True, stdin=subprocess.PIPE)
        rm_existing.stdin.close()
        rm_existing.wait()
        if rm_existing.returncode != 0:
            raise CommandException('Checking for and removing export file failed with exit code %r' % (
                rm_existing.returncode,))
        # Inline trimming may be in self.c.where_clauses
        wheres = copy.deepcopy(self.c.where_clauses)
        where_values = copy.deepcopy(self.c.where_values)
        wheres.extend([
            '%s >= %%(?)s' % (self.c.table_config.chunk_col,),
            '%s < %%(?)s' % (self.c.table_config.chunk_col,),
        ])
        where_values.extend([
            self.c.start_id,
            self.c.start_id + self.c.chunk_size,
        ])
        # We want to kick off the streamer here to enable streaming while the export file is still being written
        # but if we do and the export below fails the streamer will be stuck in an infinite loop and the next
        # time the export task gets retried we'll kick off yet another streamer task, potentially corrupting
        # data.
        # TODO(jpatrin): A fix for this would be to use a random token so the streamer knows it belongs to the
        # running exporter. Before kicking off the streamer, write a file with a random UUID next to where the
        # exported file will be. Put the token in self.c so it gets passed to the streamer. When the streamer
        # starts up, read the token file and check it vs. the token in self.c. If it's different, mark the chunk
        # as failed, end the streamer without doing anything, and don't retry the task.
        #streamer.queue_stream_chunk(self.c)
        # TODO(jpatrin): Don't we need to add the join clause here to make the where_clauses work for inline
        # trimming?
        sql = (
            (
                "SELECT %(columns)s INTO OUTFILE '%(outfile)s' CHARACTER SET utf8 "
                # We use | as an escape character as MySQL's default of \ is problematic and at some times is not
                # compatible with the csv module's parser.
                """FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '|' """
                "LINES TERMINATED BY '\\n' "
                "FROM %(schema)s.%(table)s %(table_alias)s WHERE %(wheres)s"
                # Adding sorting here only slows down the query.
                # "ORDER BY %(chunk_col)s ASC"
            ) % {
                'columns': ', '.join(
                    (
                        # We're using NULL_SENTINEL here because MySQL uses a nonstandard value for null
                        # in its OUTFILE which is not as easy to detect and convert as I'd like.
                        "IF(%s IS NULL, '%s', %s)" % (
                            col.name, NULL_SENTINEL, mysql_conn.column_query_sql(col)
                        )
                    )
                    for col in self.c.export_columns
                ),
                'outfile': self.c.export_filename,
                'schema': self.c.source_schema,
                'table': self.c.table_config.table_name,
                'table_alias': self.c.table_config.table_alias,
                'wheres': ' AND '.join(wheres),
            }
        )
        with db.cursor(mysql_conn) as cur:
            cur.execute(sql % {'?': mysql_conn.PARAMETER_PLACEHOLDER}, where_values)
            self.c.num_records_exported = cur.rowcount
        # kick off the streamer
        streamer.queue_stream_chunk(self.c)
        self.chunk.num_records_exported = self.c.num_records_exported
        self.chunk.export_elapsed_ms = int((datetime.now() - start).total_seconds() * 1000)
        self.chunk.update()
        # The streaming may or may not have started, so we only update the status if it's still set to exporting
        self.chunk.update_status('exported', where_status='exporting')
        # signal to the processor that we have reached the end of the data
        self.log('Signaling EOF to conversion')
        cmd = 'sudo bash -c "echo STOP >> %s"' % (
            ssh.escape_double_quotes(self.c.export_filename),
        )
        signal_stop = subprocess.Popen(
            'ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR '
            '-p %r -i %s %s@%s "%s"' % (
                config.MYSQL_SSH_PORT, config.SSH_PRIVKEY, config.SSH_USER, source_host,
                ssh.escape_double_quotes(cmd)),
            shell=True, stdin=subprocess.PIPE)
        signal_stop.stdin.close()
        signal_stop.wait()
        self.log('Finished chunk export num_records_exported=%s elapsed=%s',
                 self.c.num_records_exported, datetime.now() - start)
def get_count(conn, sql):
    with db.cursor(conn) as cur:
        cur.execute('SELECT COUNT(*) ' + sql)
        (num_rows,) = cur.fetchone()
    return num_rows
def _run(self):
    time.sleep(random.randint(0, 10))
    start = datetime.now()
    if self.c.source_type != 'crate':
        raise UnrecoverableError('This exporter only supports crate sources, passed in source_type was %r' % (
            self.c.source_type,))
    self.chunk.status = 'exporting'
    self.chunk.start_time = int(time.time() * 1000)
    self.chunk.update()
    with db.shard_connection(self.c.source_shard, read=True) as crate_conn:
        self.log('Starting export')
        self.c.export_dir = os.path.join(worker.SOURCE_DIR, ('%s_%s.%s_%s_%s' % (
            self.c.migration_id, self.c.source_schema, self.c.table_config.table_name,
            self.c.partition_val, self.c.chunk_num)))

        def make_source_dir(node):
            cmd = (
                'sudo mkdir -p %(export_dir)s && sudo chmod 777 %(export_dir)s'
            ) % {
                'export_dir': self.c.export_dir,
            }
            try:
                ssh.SSHHost(
                    node['host'], node['ssh_port'], config.SSH_USER, identity=config.SSH_PRIVKEY
                ).run(cmd)
            except ssh.SSHException, e:
                raise CommandException('Checking for and removing export file failed %r' % (e,))

        crate_cluster = config.DESTINATIONS[self.c.source_shard]
        data_nodes = crate_cluster['config']['data_nodes']
        threads = []
        for node in data_nodes:
            thread = threading.Thread(target=make_source_dir, args=(node,))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        # Inline trimming may be in self.c.where_clauses
        wheres = copy.deepcopy(self.c.where_clauses)
        where_values = copy.deepcopy(self.c.where_values)
        wheres.extend([
            '%s >= %%(?)s' % (self.c.table_config.chunk_col,),
            '%s < %%(?)s' % (self.c.table_config.chunk_col,),
        ])
        where_values.extend([
            self.c.start_id,
            self.c.start_id + self.c.chunk_size,
        ])
        if self.c.table_config.join:
            # TODO: This is a HORRIBLE HACK that removes the hard-coded table alias from the partition and chunk
            # columns.
            # It is likely to cause problems. A refactoring of the join code is needed.
            column_alias = '%s.' % (self.c.table_config.table_alias,)
            fixed_wheres = []
            for where in wheres:
                if where.startswith(column_alias):
                    where = where[len(column_alias):]
                fixed_wheres.append(where)
            wheres = fixed_wheres
        # We want to kick off the streamer here to enable streaming while the export file is still being written
        # but if we do and the export below fails the streamer will be stuck in an infinite loop and the next
        # time the export task gets retried we'll kick off yet another streamer task, potentially corrupting
        # data.
        # TODO(jpatrin): A fix for this would be to use a random token so the streamer knows it belongs to the
        # running exporter. Before kicking off the streamer, write a file with a random UUID next to where the
        # exported file will be. Put the token in self.c so it gets passed to the streamer. When the streamer
        # starts up, read the token file and check it vs. the token in self.c. If it's different, mark the chunk
        # as failed, end the streamer without doing anything, and don't retry the task.
        # NOTE: This would only work with crate if we append the STOP sentinel to all of the json files that each
        # node writes to export_dir or if we come up with another way to know when the export is finished.
        #streamer.queue_stream_chunk(self.c)
        # TODO(jpatrin): Don't we need to add the join clause here to make the where_clauses work for inline
        # trimming? They won't work with COPY TO, but we may need to figure out a way to support it.
        sql = (
            "COPY %(schema)s.%(table)s (%(columns)s) "
            "WHERE %(wheres)s "
            "TO DIRECTORY '%(export_dir)s' "
            "WITH (format='json_array') "
        ) % {
            'columns': ', '.join(col.name for col in self.c.export_columns),
            'schema': self.c.source_schema,
            'table': self.c.table_config.table_name,
            'wheres': ' AND '.join(wheres),
            'export_dir': self.c.export_dir,
        }
        #self.log_warning('%s', sql)
        #self.log_warning(repr(where_values))
        with db.cursor(crate_conn) as cur:
            cur.execute(sql % {'?': crate_conn.PARAMETER_PLACEHOLDER}, where_values)
            self.c.num_records_exported = cur.rowcount
def migrate(self, source_conn, dest_conn):
    # TODO: Implement option #2 as well as large numbers of records will not work with option #1 and crate.
    # The python crate client always loads all result records into memory before returning anything
    # (it doesn't support streaming results, like any normal DB API would), so we need to either support
    # both of these options and switch between them at some threshold of record count or just use the second
    # option.
    # When migrating from crate we need to either have an explicit limit or work around the implicit limit of
    # 10000 records. In order to make this work we need to do one of the following:
    #
    # 1) pre-query for the count and use that as the explicit limit (plus a fudge factor? multiplied?)
    #    * prone to errors if the number of records in the chunk would change between query time and the final
    #      SELECT. If the number of records increases enough between the COUNT and the SELECT in the result then
    #      the query could miss some of the records and not get picked up by a later autodelta as they would not
    #      have been updated.
    #    * Can mitigate by running the count then multiplying it by 2. Use that as the explicit LIMIT. Check the
    #      number of records we got vs. that limit. If we got exactly that number of records we need to try
    #      again with the limit doubled again.
    #    * This means we're re-doing all of the work but the possibility of this happening should be low enough
    #      that this only happens in extreme circumstances.
    #
    # 2) add an ORDER BY and use the ordered column to query for more records after each SELECT finishes.
    #    * adding ORDER BY slows down the query and adds load to the source database
    #    * prone to missing records which might have been inserted below any record we get on each loop
    #      i.e. assumes that the ordered field is an always increasing id field, like a mysql autoincrement id
    #    * using autodelta migrations (or a complete migration) should mean that any records potentially missed
    #      would be picked up by the delta migrations
    #    * ORDER BY and LIMIT only works if we have a unique column or primary key to use as an extra WHERE
    #      clause. Any primary key could potentially work but the problem is defining the where clause to get a
    #      part of the ordering.
    #    * This should be fixable by using ORDER BY and LIMIT with OFFSET, but this is less efficient than the
    #      extra WHERE clause as it means the server needs to scan the results to the OFFSET value each time.
    #
    # We need to implement #2 due to crate-python's inability to stream results. Very large results will not
    # only slow down the processing but are likely to cause memory errors in this module.
    #
    # NOTE: crate can't sort by a partitioned column. If the primary key has a partition column then we
    # can't use the ORDER, LIMIT, WHERE pk > max option.
    # The only option in this case is to use ORDER, LIMIT, OFFSET while ordering only by the non-partition
    # primary key columns.
    #
    # What about a table where a single primary key column is also the partition column? We could potentially
    # use all indexed non-partition columns in the ORDER BY but this slows down the query.
    #
    # Maybe the right thing would just be to use COPY WHERE TO and stream the json files from the data nodes and
    # upsert them to the destination from there. That way we don't have to perform any heroics to get any data
    # out of crate. The downside, of course, is that we'll need to ssh to the data nodes to stream the files.
    # * Can use COPY table (columns...) WHERE ... TO DIRECTORY ... WITH (format='json_array') to reduce
    #   duplication of keys in the json.
    wheres = deepcopy(self.c.where_clauses) + [
        '%s >= %%(?)s' % (self.c.table_config.chunk_col,),
        '%s < %%(?)s' % (self.c.table_config.chunk_col,),
    ]
    base_values = deepcopy(self.c.where_values) + [
        self.c.start_id,
        self.c.start_id + self.c.chunk_size,
    ]
    if self.c.migration_type == orm.MigrationType.DELTA and self.c.table_config.join:
        if self.c.source_type == 'crate':
            self.log('Chunk cannot be piped, migration type is delta, table has a join, and source is crate. '
                     'Chunk will be exported and streamed instead.')
            self.chunk.status = 'queued'
            self.chunk.update()
            exporter.queue_export_chunk(self.c)
            return False
        join = self.c.table_config.join % {'schema': self.c.source_schema}
    else:
        join = ''
    base_sql = (
        'FROM %(schema)s.%(table)s %(table_alias)s %(join)s '
        'WHERE %(where_clauses)s'
    ) % {
        'schema': self.c.source_schema,
        'table': self.c.table_config.table_name,
        'table_alias': self.c.table_config.table_alias,
        'join': join,
        'where_clauses': ' AND '.join(wheres)
    }
    # crate has an implicit limit of 10000, we query for the count here to make sure we get all
    # of the records
    if self.c.source_type == 'crate':
        self.log('Querying for chunk size')
        sql = 'SELECT COUNT(*) %s' % (base_sql,)
        with db.cursor(source_conn) as source_cur:
            source_cur.execute(sql % {'?': source_conn.PARAMETER_PLACEHOLDER}, base_values)
            (count,) = source_cur.fetchone()
        if not count:
            self.log('No data found for chunk')
            return True
        use_order = count > config.MAX_CRATE_RESULT_SIZE
        if use_order:
            (
                primary_key_indexes,
                primary_key_columns,
            ) = zip(*[
                (
                    i,
                    col,
                )
                for (i, col) in enumerate(self.c.export_columns)
                if col.is_primary_key
            ])
            use_offset = False
            if not primary_key_columns:
                self.log_warning('Table has no primary key columns')
                use_offset = True
            else:
                unorderable_columns = [
                    col.lower()
                    for col in source_conn.get_unorderable_columns(
                        self.c.source_schema, self.c.table_config.table_name
                    )
                ]
                if any(pkc.name.lower() in unorderable_columns for pkc in primary_key_columns):
                    self.log_warning('Table has primary key columns that cannot be used for sorting')
                    use_offset = True
            if use_offset:
                self.log('Chunk cannot be piped, it must be exported and streamed')
                self.chunk.status = 'queued'
                self.chunk.update()
                exporter.queue_export_chunk(self.c)
                return False
                # self.log_warning(
                #     'Falling back to full ordering and LIMIT OFFSET querying. Depending on the cardinality of '
                #     'the fields this may be very expensive or miss records depending on whether the sort order '
                #     'is deterministic.'
                # )
                # (
                #     key_indexes,
                #     key_columns
                # ) = zip(*[
                #     (
                #         i,
                #         col,
                #     )
                #     for (i, col) in enumerate(self.c.export_columns)
                #     if col.name.lower() not in unorderable_columns
                # ])
            else:
                self.log(
                    'Chunk size (%u) is larger than the configured MAX_CRATE_RESULT_SIZE (%u). '
                    'This chunk will be broken up into multiple ordered queries.' % (
                        count, config.MAX_CRATE_RESULT_SIZE,
                    )
                )
                key_indexes = primary_key_indexes
                key_columns = primary_key_columns
                # if not key_columns:
                #     raise UnrecoverableError(
                #         'No sortable columns found, cannot migrate this chunk as it is larger '
                #         'than MAX_CRATE_RESULT_SIZE'
                #     )
                limit = config.MAX_CRATE_RESULT_SIZE
                order_sql = 'ORDER BY %s LIMIT %u' % (
                    ', '.join(col.name for col in key_columns),
                    limit
                )
        else:
            limit = count
    else:
        use_order = False
    self.chunk.num_records_exported = 0
    self.chunk.update()
    key_max_values = []
    while True:
        # TODO: Refactor?
        if use_order:
            if use_offset:
                raise UnrecoverableError('Implement ORDER LIMIT OFFSET?')
            else:
                if key_max_values:
                    # WHERE c1 > mv1 OR c1 == mv1 AND (c2 > mv2 OR c2 == mv2 AND c3 > mv3)
                    (
                        primary_key_sql,
                        primary_key_values,
                    ) = generate_primary_key_sql(key_columns, key_max_values)
                    loop_values = list(base_values) + list(primary_key_values)
                    loop_base_sql = '%s AND %s %s' % (
                        base_sql,
                        primary_key_sql,
                        order_sql,
                    )
                else:
                    loop_base_sql = '%s %s' % (
                        base_sql,
                        order_sql,
                    )
                    loop_values = base_values
        elif self.c.source_type == 'crate':
            limit *= 2
            loop_base_sql = '%s LIMIT %u' % (base_sql, limit)
            loop_values = base_values
        else:
            loop_base_sql = base_sql
            loop_values = base_values
        sql = (
            'SELECT %s %s'
        ) % (
            ', '.join(
                source_conn.column_query_sql(col) for col in self.c.export_columns
            ),
            loop_base_sql,
        )
        if not use_order:
            self.chunk.num_records_exported = 0
            self.chunk.update()
        with db.cursor(source_conn) as source_cur:
            source_cur.execute(sql % {'?': source_conn.PARAMETER_PLACEHOLDER}, loop_values)
            num_recs = 0
            while True:
                records = source_cur.fetchmany(config.PIPE_BULK_INSERT_SIZE)
                if not records:
                    break
                self.chunk.num_records_exported += len(records)
                self.chunk.update()
                num_recs += len(records)
                if use_order and not use_offset:
                    new_key_max_values = []
                    for col_idx_idx in xrange(len(key_indexes)):
                        max_val = max(r[key_indexes[col_idx_idx]] for r in records)
                        if key_max_values:
                            max_val = max(max_val, key_max_values[col_idx_idx])
                        new_key_max_values.append(max_val)
                    key_max_values = new_key_max_values
                self.log_debug('Got records from source num_records=%s', len(records))
                self.upsert(dest_conn, records)
        if self.c.source_type == 'crate':
            if use_order:
                if num_recs < limit:
                    self.log('Chunk finished num_recs=%u limit=%u', num_recs, limit)
                    break
                else:
                    self.log(
                        'Chunk has more records key_max_values=%r num_recs=%u limit=%u',
                        key_max_values, num_recs, limit
                    )
            # if we got as many records as the limit set above it is likely there were more than that to get
            # so we need to loop and do it over with a higher limit
            elif self.chunk.num_records_exported < limit:
                break
            else:
                self.log(
                    'The number of records has grown more than double, retrying with double the limit limit=%u',
                    limit
                )
        else:
            break
    return True
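# The keyset-pagination helper generate_primary_key_sql() used in migrate() is not shown in this
# section. The following is a minimal sketch (an assumption, not the repo's implementation) of a
# builder for the nested predicate described in the comment above,
# "WHERE c1 > mv1 OR c1 == mv1 AND (c2 > mv2 OR c2 == mv2 AND c3 > mv3)",
# using the same %(?)s placeholder convention as the surrounding code.
def _example_generate_primary_key_sql(key_columns, key_max_values):
    # Returns (sql_fragment, values) so each ordered query resumes strictly after the largest
    # primary key seen so far.
    col = key_columns[0]
    val = key_max_values[0]
    if len(key_columns) == 1:
        # Last (or only) key column: a strict comparison is enough.
        return '%s > %%(?)s' % (col.name,), [val]
    # Recurse for the remaining columns and nest them under an equality on this column.
    rest_sql, rest_values = _example_generate_primary_key_sql(key_columns[1:], key_max_values[1:])
    sql = '(%s > %%(?)s OR (%s = %%(?)s AND %s))' % (col.name, col.name, rest_sql)
    return sql, [val, val] + rest_values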