def queue_chunks(self):
    self.c.chunk_size = max(
        1,
        int(
            math.ceil(
                (self.max_id - self.min_id + 1)
                / math.ceil(float(self.num_rows) / float(config.CHUNK_SIZE)))))
    # Start at -1 and immediately increment it so that, even if we continue below, we always increase
    # the chunk_num
    chunk_num = -1
    for start_id in xrange(self.min_id, self.max_id + 1, self.c.chunk_size):
        chunk_num += 1
        chunk_config = deepcopy(self.c)
        chunk_config.chunk_num = chunk_num
        chunk_config.start_id = start_id
        chunk_config.set_filenames()
        if self.c.migration_type == orm.MigrationType.FULL:
            ttcs = orm.Chunk.get_by_index(
                self.redis_conn,
                # TODO(jpatrin): Add start_id and chunk_size?
                partition_val=chunk_config.partition_val,
                table_name=chunk_config.table_config.table_name,
                chunk_num=chunk_config.chunk_num,
                namespace=chunk_config.namespace,
                source_shard=chunk_config.source_shard,
                destination_shard=chunk_config.destination_shard)
            found_ttc = False
            for ttc in ttcs:
                if (
                    ttc.status in ('imported', 'empty')
                    # If we were passed a latest migration, only take into account chunks from that migration
                    and (self.c.latest_migration_id is None
                         or ttc.migration_id == self.c.latest_migration_id)
                ):
                    if not self.c.force:
                        self.log('Chunk already %s, not requeueing', ttc.status)
                        found_ttc = True
                        break
                    self.log('Chunk already %s but reimport forced', ttc.status)
            if found_ttc:
                continue
        ttc = orm.Chunk(self.redis_conn)
        for col in ttc._cols():
            if hasattr(chunk_config, col):
                setattr(ttc, col, getattr(chunk_config, col))
            elif hasattr(chunk_config.table_config, col):
                setattr(ttc, col, getattr(chunk_config.table_config, col))
        ttc.queued_time = int(time.time() * 1000)
        ttc.status = 'queued'
        ttc.insert()
        if self.c.migration_type == orm.MigrationType.FULL:
            if self.c.chunk_migration_type == orm.ChunkMigrationType.INDIRECT:
                exporter.queue_export_chunk(chunk_config)
            elif self.c.chunk_migration_type == orm.ChunkMigrationType.DIRECT:
                pipe.queue_pipe_chunk(chunk_config)
        elif self.c.migration_type == orm.MigrationType.DELTA:
            pipe.queue_pipe_chunk(chunk_config)
        else:
            raise UnrecoverableError('Unknown migration type %r' % (self.c.migration_type,))
    self.num_chunks = chunk_num + 1
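
# Illustration only (not used by the migration flow): the chunk sizing above first estimates the number
# of chunks as ceil(num_rows / CHUNK_SIZE), then spreads the id range evenly across that many chunks.
# A minimal sketch of the same arithmetic, relying on the module's existing math import; the example
# values below are hypothetical.
def _example_chunk_size(min_id, max_id, num_rows, rows_per_chunk):
    # Number of chunks we want, based on row count.
    num_chunks = math.ceil(float(num_rows) / float(rows_per_chunk))
    # Width of the id range each chunk should cover.
    return max(1, int(math.ceil((max_id - min_id + 1) / num_chunks)))

# e.g. ids 1..1000000 holding 250000 rows with CHUNK_SIZE=50000 gives 5 chunks of 200000 ids each:
# _example_chunk_size(1, 1000000, 250000, 50000) == 200000
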
def migrate(self, source_conn, dest_conn):
    # TODO: Implement option #2 as well, since large numbers of records will not work with option #1 and
    # crate. The python crate client always loads all result records into memory before returning anything
    # (it doesn't support streaming results, like any normal DB API would), so we need to either support
    # both of these options and switch between them at some threshold of record count or just use the second
    # option.
    # When migrating from crate we need to either have an explicit limit or work around the implicit limit
    # of 10000 records. In order to make this work we need to do one of the following:
    #
    # 1) pre-query for the count and use that as the explicit limit (plus a fudge factor? multiplied?)
    #    * prone to errors if the number of records in the chunk changes between the COUNT and the final
    #      SELECT. If the number of records increases enough in between, the SELECT could miss some of the
    #      records and they would not get picked up by a later autodelta as they would not have been updated.
    #    * Can mitigate by running the count, multiplying it by 2, and using that as the explicit LIMIT.
    #      Check the number of records we got vs. that limit. If we got exactly that number of records we
    #      need to try again with the limit doubled again.
    #    * This means we're re-doing all of the work, but the possibility of this happening should be low
    #      enough that it only happens in extreme circumstances.
    #
    # 2) add an ORDER BY and use the ordered column to query for more records after each SELECT finishes.
    #    * adding ORDER BY slows down the query and adds load to the source database
    #    * prone to missing records which might have been inserted below any record we get on each loop,
    #      i.e. it assumes that the ordered field is an always-increasing id field, like a mysql
    #      autoincrement id
    #    * using autodelta migrations (or a complete migration) should mean that any records potentially
    #      missed would be picked up by the delta migrations
    #    * ORDER BY and LIMIT only works if we have a unique column or primary key to use as an extra WHERE
    #      clause. Any primary key could potentially work, but the problem is defining the WHERE clause to
    #      get a part of the ordering.
    #    * This should be fixable by using ORDER BY and LIMIT with OFFSET, but this is less efficient than
    #      the extra WHERE clause as it means the server needs to scan the results up to the OFFSET value
    #      each time.
    #
    # We need to implement #2 due to crate-python's inability to stream results. Very large results will not
    # only slow down the processing but are likely to cause memory errors in this module.
    #
    # Unfortunately, crate can't sort by a partitioned column. If the primary key includes a partition
    # column then we can't use the ORDER, LIMIT, WHERE pk > max option. The only option in this case is to
    # use ORDER, LIMIT, OFFSET while ordering only by the non-partition primary key columns.
    #
    # What about a table where a single primary key column is also the partition column? We could
    # potentially use all indexed non-partition columns in the ORDER BY, but this slows down the query.
    #
    # Maybe the right thing would just be to use COPY ... WHERE ... TO, stream the json files from the data
    # nodes, and upsert them to the destination from there. That way we don't have to perform any heroics to
    # get data out of crate. The downside, of course, is that we'll need to ssh to the data nodes to stream
    # the files.
    #    * Can use COPY table (columns...) WHERE ... TO DIRECTORY ... WITH (format='json_array') to reduce
    #      duplication of keys in the json.
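    # Illustrative (simplified, assumed) query shapes for the two options above, with a hypothetical
    # two-column primary key (c1, c2) and the chunk bounds already present in the WHERE clause; option #2's
    # key clause is built by generate_primary_key_sql in the loop below:
    #
    #   option #1:  SELECT ... WHERE <chunk bounds> LIMIT <2 * count>
    #   option #2:  SELECT ... WHERE <chunk bounds>
    #                   AND (c1 > ?max1 OR (c1 = ?max1 AND c2 > ?max2))
    #               ORDER BY c1, c2 LIMIT <MAX_CRATE_RESULT_SIZE>
    #
    # where ?max1/?max2 are the largest key values seen in the previous batch.
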
    wheres = deepcopy(self.c.where_clauses) + [
        '%s >= %%(?)s' % (self.c.table_config.chunk_col,),
        '%s < %%(?)s' % (self.c.table_config.chunk_col,),
    ]
    base_values = deepcopy(self.c.where_values) + [
        self.c.start_id,
        self.c.start_id + self.c.chunk_size,
    ]
    if self.c.migration_type == orm.MigrationType.DELTA and self.c.table_config.join:
        if self.c.source_type == 'crate':
            self.log('Chunk cannot be piped, migration type is delta, table has a join, and source is '
                     'crate. Chunk will be exported and streamed instead.')
            self.chunk.status = 'queued'
            self.chunk.update()
            exporter.queue_export_chunk(self.c)
            return False
        join = self.c.table_config.join % {'schema': self.c.source_schema}
    else:
        join = ''
    base_sql = (
        'FROM %(schema)s.%(table)s %(table_alias)s %(join)s '
        'WHERE %(where_clauses)s'
    ) % {
        'schema': self.c.source_schema,
        'table': self.c.table_config.table_name,
        'table_alias': self.c.table_config.table_alias,
        'join': join,
        'where_clauses': ' AND '.join(wheres),
    }
    # crate has an implicit limit of 10000, so we query for the count here to make sure we get all
    # of the records
    if self.c.source_type == 'crate':
        self.log('Querying for chunk size')
        sql = 'SELECT COUNT(*) %s' % (base_sql,)
        with db.cursor(source_conn) as source_cur:
            source_cur.execute(sql % {'?': source_conn.PARAMETER_PLACEHOLDER}, base_values)
            (count,) = source_cur.fetchone()
        if not count:
            self.log('No data found for chunk')
            return True
        use_order = count > config.MAX_CRATE_RESULT_SIZE
        if use_order:
            primary_key_pairs = [
                (i, col)
                for (i, col) in enumerate(self.c.export_columns)
                if col.is_primary_key
            ]
            # zip(*[]) would raise, so only unpack when there actually are primary key columns
            if primary_key_pairs:
                (
                    primary_key_indexes,
                    primary_key_columns,
                ) = zip(*primary_key_pairs)
            else:
                primary_key_indexes = primary_key_columns = ()
            use_offset = False
            if not primary_key_columns:
                self.log_warning('Table has no primary key columns')
                use_offset = True
            else:
                unorderable_columns = [
                    col.lower()
                    for col in source_conn.get_unorderable_columns(
                        self.c.source_schema, self.c.table_config.table_name
                    )
                ]
                if any(pkc.name.lower() in unorderable_columns for pkc in primary_key_columns):
                    self.log_warning('Table has primary key columns that cannot be used for sorting')
                    use_offset = True
            if use_offset:
                self.log('Chunk cannot be piped, it must be exported and streamed')
                self.chunk.status = 'queued'
                self.chunk.update()
                exporter.queue_export_chunk(self.c)
                return False
                # self.log_warning(
                #     'Falling back to full ordering and LIMIT OFFSET querying. Depending on the '
                #     'cardinality of the fields this may be very expensive or miss records depending on '
                #     'whether the sort order is deterministic.'
                # )
                # (
                #     key_indexes,
                #     key_columns,
                # ) = zip(*[
                #     (i, col)
                #     for (i, col) in enumerate(self.c.export_columns)
                #     if col.name.lower() not in unorderable_columns
                # ])
            else:
                self.log(
                    'Chunk size (%u) is larger than the configured MAX_CRATE_RESULT_SIZE (%u). '
                    'This chunk will be broken up into multiple ordered queries.' % (
                        count,
                        config.MAX_CRATE_RESULT_SIZE,
                    )
                )
                key_indexes = primary_key_indexes
                key_columns = primary_key_columns
                # if not key_columns:
                #     raise UnrecoverableError(
                #         'No sortable columns found, cannot migrate this chunk as it is larger '
                #         'than MAX_CRATE_RESULT_SIZE'
                #     )
                limit = config.MAX_CRATE_RESULT_SIZE
                order_sql = 'ORDER BY %s LIMIT %u' % (
                    ', '.join(col.name for col in key_columns),
                    limit,
                )
        else:
            limit = count
    else:
        use_order = False
    self.chunk.num_records_exported = 0
    self.chunk.update()
    key_max_values = []
    while True:
        # TODO: Refactor?
        if use_order:
            if use_offset:
                raise UnrecoverableError('Implement ORDER LIMIT OFFSET?')
            else:
                if key_max_values:
                    # WHERE c1 > mv1 OR c1 == mv1 AND (c2 > mv2 OR c2 == mv2 AND c3 > mv3)
                    (
                        primary_key_sql,
                        primary_key_values,
                    ) = generate_primary_key_sql(key_columns, key_max_values)
                    loop_values = list(base_values) + list(primary_key_values)
                    loop_base_sql = '%s AND %s %s' % (
                        base_sql,
                        primary_key_sql,
                        order_sql,
                    )
                else:
                    loop_base_sql = '%s %s' % (
                        base_sql,
                        order_sql,
                    )
                    loop_values = base_values
        elif self.c.source_type == 'crate':
            limit *= 2
            loop_base_sql = '%s LIMIT %u' % (base_sql, limit)
            loop_values = base_values
        else:
            loop_base_sql = base_sql
            loop_values = base_values
        sql = 'SELECT %s %s' % (
            ', '.join(
                source_conn.column_query_sql(col)
                for col in self.c.export_columns
            ),
            loop_base_sql,
        )
        if not use_order:
            self.chunk.num_records_exported = 0
            self.chunk.update()
        with db.cursor(source_conn) as source_cur:
            source_cur.execute(sql % {'?': source_conn.PARAMETER_PLACEHOLDER}, loop_values)
            num_recs = 0
            while True:
                records = source_cur.fetchmany(config.PIPE_BULK_INSERT_SIZE)
                if not records:
                    break
                self.chunk.num_records_exported += len(records)
                self.chunk.update()
                num_recs += len(records)
                if use_order and not use_offset:
                    # Track the largest primary key values seen so far so the next query's WHERE clause
                    # can pick up where this batch left off.
                    new_key_max_values = []
                    for col_idx_idx in xrange(len(key_indexes)):
                        max_val = max(r[key_indexes[col_idx_idx]] for r in records)
                        if key_max_values:
                            max_val = max(max_val, key_max_values[col_idx_idx])
                        new_key_max_values.append(max_val)
                    key_max_values = new_key_max_values
                self.log_debug('Got records from source num_records=%s', len(records))
                self.upsert(dest_conn, records)
        if self.c.source_type == 'crate':
            if use_order:
                if num_recs < limit:
                    self.log('Chunk finished num_recs=%u limit=%u', num_recs, limit)
                    break
                else:
                    self.log(
                        'Chunk has more records key_max_values=%r num_recs=%u limit=%u',
                        key_max_values, num_recs, limit
                    )
            # if we got as many records as the limit set above, it is likely there were more than that to
            # get, so we need to loop and do it over with a higher limit
            elif self.chunk.num_records_exported < limit:
                break
            else:
                self.log(
                    'The number of records has grown to more than double, retrying with double the limit '
                    'limit=%u',
                    limit
                )
        else:
            break
    return True
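
# A minimal sketch of the keyset-pagination helper used above. The real generate_primary_key_sql is
# defined elsewhere; this hypothetical version only illustrates the clause described in the comment in
# migrate():
#   WHERE c1 > mv1 OR (c1 == mv1 AND (c2 > mv2 OR (c2 == mv2 AND c3 > mv3)))
# It assumes key_columns expose a .name attribute (as used for order_sql above) and the same %(?)s
# placeholder convention used by the other queries in this module.
def _example_generate_primary_key_sql(key_columns, key_max_values):
    col = key_columns[0].name
    if len(key_columns) == 1:
        # Last column only needs a strict "greater than" comparison.
        return '%s > %%(?)s' % (col,), [key_max_values[0]]
    # Recurse on the remaining columns and nest them under "equal on this column".
    inner_sql, inner_values = _example_generate_primary_key_sql(key_columns[1:], key_max_values[1:])
    sql = '(%s > %%(?)s OR (%s = %%(?)s AND %s))' % (col, col, inner_sql)
    values = [key_max_values[0], key_max_values[0]] + list(inner_values)
    return sql, values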