def __cleanup_previous_runs(self):
    if not yes_no('The next step will kill all mongodb processes and wipe out the data path.\n' +
                  'Proceed (yes/no)? '):
        raise KeyboardInterrupt('User disallowed cleanup of the data path')

    # Iterate through all processes and kill mongod and mongos
    for process in psutil.process_iter():
        try:
            processExecutable = os.path.basename(process.exe())
        except psutil.NoSuchProcess:
            pass
        except psutil.AccessDenied:
            pass
        else:
            if processExecutable in [exe_name('mongod'), exe_name('mongos')]:
                process.kill()
                process.wait()

    # Remove the output directories
    try:
        shutil.rmtree(self.introspectRoot)
    except FileNotFoundError:
        pass

    try:
        shutil.rmtree(self.clusterRoot)
    except FileNotFoundError:
        pass
def __cleanup_previous_runs(self):
    if not yes_no('The next step will kill all mongodb processes and wipe out the data path.\n' +
                  'Proceed (yes/no)? '):
        return False

    # Iterate through all processes and kill mongod and mongos
    for process in psutil.process_iter():
        try:
            processExecutable = os.path.basename(process.exe())
        except psutil.NoSuchProcess:
            pass
        except psutil.AccessDenied:
            pass
        else:
            if processExecutable in [exe_name('mongod'), exe_name('mongos')]:
                process.kill()
                process.wait()

    # Remove the output directory
    shutil.rmtree(self.dir)

    return True
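# Both variants of __cleanup_previous_runs above rely on two small helpers that are not shown in
# this excerpt. The sketch below is an assumption of what they might look like (the names `yes_no`
# and `exe_name` come from the call sites above; the bodies are illustrative, not the actual
# implementations).
import sys


def yes_no(prompt):
    # Keep asking until the user gives an explicit yes/no answer
    while True:
        answer = input(prompt).strip().lower()
        if answer in ('yes', 'y'):
            return True
        if answer in ('no', 'n'):
            return False


def exe_name(name):
    # Account for the executable suffix on Windows (e.g. mongod.exe)
    return name + '.exe' if sys.platform == 'win32' else name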
def __init__(self, config, introspect):
    self._config = config
    self._introspect = introspect

    if config.numShards is None:
        numShards = introspect.configDb.shards.count_documents({})
    else:
        numShards = config.numShards

    if numShards > 10:
        if not yes_no('The imported configuration data contains a large number of shards (' +
                      str(numShards) + '). Proceeding will start a large number of mongod '
                      'processes.\n'
                      'Are you sure you want to continue (yes/no)? '):
            raise KeyboardInterrupt('Too many shards will be created')

    config.mlaunch_action('init', config.clusterRoot, [
        '--sharded', str(numShards), '--replicaset', '--nodes', '1', '--csrs', '--mongos', '1',
        '--port', str(config.clusterStartingPort), '--wiredTigerCacheSizeGB', '0.25',
        '--oplogSize', '50'
    ])

    # Set the correct FCV on the cluster being reconstructed
    clusterConnection = MongoClient('localhost', config.clusterStartingPort)
    self._config.log_line(
        clusterConnection.admin.command('setFeatureCompatibilityVersion', introspect.FCV))

    # TODO: Find a better way to determine the port of the config server's primary
    self.configServerPort = config.clusterStartingPort + (numShards + 1)

    configServerConnection = MongoClient('localhost', self.configServerPort)
    self.configDb = configServerConnection.config
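# `mlaunch_action` is a method of the tool's configuration object and is defined outside this
# excerpt. A minimal sketch of how such a helper could wrap the mlaunch CLI is shown below,
# assuming the configuration object exposes `binarypath` and `log_line`; the body is illustrative,
# not the actual implementation.
import subprocess


def mlaunch_action(self, action, dir_path, extra_args=None):
    # Run `mlaunch <action> --binarypath <binaries> --dir <dir_path> [extra args ...]` and fail
    # loudly if mlaunch exits with an error
    args = ['mlaunch', action, '--binarypath', self.binarypath, '--dir', dir_path]
    if extra_args:
        args += extra_args
    self.log_line(' '.join(args))
    subprocess.check_call(args)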
def main():
    argsParser = argparse.ArgumentParser(
        description='Tool to interpret an export of a cluster config database and construct a '
        'new cluster with exactly the same configuration. Requires mlaunch to be installed and '
        'in the system path.')
    argsParser.add_argument('--binarypath', help='Directory containing the MongoDB binaries',
                            metavar='binarypath', type=str, required=True)
    argsParser.add_argument(
        '--dir', help='Directory in which to place the data files (will create subdirectories)',
        metavar='dir', type=str, required=True)
    argsParser.add_argument('configdumpdir',
                            help='Directory containing a dump of the cluster config database',
                            metavar='configdumpdir', type=str, nargs=1)

    config = ToolConfiguration(argsParser.parse_args())

    # Read the cluster configuration from the preprocess instance and construct the new cluster
    introspect = ClusterIntrospect(config)

    numShards = introspect.configDb.shards.count_documents({})
    if numShards > 10:
        if not yes_no('The imported configuration data contains a large number of shards (' +
                      str(numShards) + '). Proceeding will start a large number of mongod '
                      'processes.\n'
                      'Are you sure you want to continue (yes/no)? '):
            return 1

    # Make the output directories
    mongodClusterRootPath = os.path.join(config.dir, 'cluster_root')
    os.makedirs(mongodClusterRootPath)

    config.mlaunch_action('init', mongodClusterRootPath, [
        '--replicaset', '--nodes', '1', '--sharded', str(numShards), '--csrs', '--mongos', '1',
        '--port', str(config.clusterStartingPort)
    ])

    # Restore the config database dump onto the config server's primary
    configServerPort = config.clusterStartingPort + numShards + 1
    config.restore_config_db_to_port(configServerPort)

    configServerConnection = MongoClient('localhost', configServerPort)
    configServerConfigDB = configServerConnection.config

    # Rename the shards from the dump to the shards launched by mlaunch in the shards collection
    print('Renaming shards in the shards collection:')

    # Shards that came with the config dump
    SHARDS_FROM_DUMP = list(introspect.configDb.shards.find({}).sort('_id', 1))

    # Shards that mlaunch generated (should be the same size as SHARDS_FROM_DUMP)
    SHARDS_FROM_MLAUNCH = list(
        configServerConfigDB.shards.find({
            '_id': {'$not': {'$in': list(map(lambda x: x['_id'], SHARDS_FROM_DUMP))}}
        }).sort('_id', 1))

    assert len(SHARDS_FROM_DUMP) == len(SHARDS_FROM_MLAUNCH)

    # Patch the _id and host of the shards from the dump to those of the mlaunch-generated shards
    shardsToInsert = []
    for shardFromDump, shardFromMlaunch in zip(deepcopy(SHARDS_FROM_DUMP), SHARDS_FROM_MLAUNCH):
        shardFromDump['_id'] = shardFromMlaunch['_id']
        shardFromDump['host'] = shardFromMlaunch['host']
        shardsToInsert.append(shardFromDump)

    # Wipe out all the shards (both from the dump and from mlaunch)
    result = configServerConfigDB.shards.delete_many({})
    config.log_line(result.raw_result)

    # Insert the patched shard documents
    for shardToInsert in shardsToInsert:
        result = configServerConfigDB.shards.insert_one(shardToInsert)
        config.log_line(result.inserted_id)

    # Rename the shards from the dump to the shards launched by mlaunch in the routing metadata
    print('Renaming shards in the routing metadata:')
    for shardIdFromDump, shardIdFromMlaunch in zip(
            list(map(lambda x: x['_id'], SHARDS_FROM_DUMP)),
            list(map(lambda x: x['_id'], SHARDS_FROM_MLAUNCH))):
        print('Shard ' + shardIdFromDump + ' becomes ' + shardIdFromMlaunch)

        # Rename the primary shard of all databases
        result = configServerConfigDB.databases.update_many(
            {'primary': shardIdFromDump}, {'$set': {'primary': shardIdFromMlaunch}})
        config.log_line(result.raw_result)

        # Rename the shards in the chunks' current owner field
        result = configServerConfigDB.chunks.update_many(
            {'shard': shardIdFromDump}, {'$set': {'shard': shardIdFromMlaunch}})
        config.log_line(result.raw_result)

        # Rename the shards in the chunks' history
        result = configServerConfigDB.chunks.update_many(
            {'history.shard': shardIdFromDump},
            {'$set': {'history.$[].shard': shardIdFromMlaunch}})
        config.log_line(result.raw_result)

    # Create the collections and construct the shard key indexes on all shard nodes
    for shard in configServerConfigDB.shards.find({}):
        print('Creating shard key indexes on shard ' + shard['_id'])

        shardConnParts = shard['host'].split('/', 1)
        shardConnection = MongoClient(shardConnParts[1], replicaset=shardConnParts[0])

        for collection in configServerConfigDB.collections.find({'dropped': False}):
            collectionParts = collection['_id'].split('.', 1)
            dbName = collectionParts[0]
            collName = collectionParts[1]
            collUUID = collection['uuid'] if 'uuid' in collection else None
            shardKey = collection['key']

            db = shardConnection.get_database(dbName)

            # Create the collection through applyOps so that it is assigned the same UUID as in
            # the config dump
            applyOpsCommand = {
                'applyOps': [{
                    'op': 'c',
                    'ns': dbName + '.$cmd',
                    'o': {
                        'create': collName,
                    },
                }]
            }

            if collUUID:
                applyOpsCommand['applyOps'][0]['ui'] = collUUID

            config.log_line("db.adminCommand(" + str(applyOpsCommand) + ");")
            shardConnection.admin.command(
                applyOpsCommand,
                codec_options=CodecOptions(uuid_representation=4))  # 4 == STANDARD

            # Build the shard key index on the shard
            createIndexesCommand = {
                'createIndexes': collName,
                'indexes': [{'key': shardKey, 'name': 'Shard key index'}]
            }
            config.log_line("db.getSiblingDB('" + dbName + "').runCommand(" +
                            str(createIndexesCommand) + ");")
            db.command(createIndexesCommand)

        shardConnection.close()

    # Restart the cluster so it picks up the new configuration cleanly
    config.mlaunch_action('restart', mongodClusterRootPath)

    return 0
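# `restore_config_db_to_port` is another helper of the configuration object that is not part of
# this excerpt. A plausible sketch is shown below, assuming the parsed `configdumpdir` argument is
# kept on the object and that the dump is replayed with the stock `mongorestore` tool; the exact
# flags and error handling are assumptions, not the actual implementation.
import subprocess


def restore_config_db_to_port(self, port):
    # Replay the config database dump against the freshly started config server primary
    args = ['mongorestore', '--port', str(port), self.configdumpdir[0]]
    self.log_line(' '.join(args))
    subprocess.check_call(args)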
async def main(args):
    cluster = Cluster(args.uri, asyncio.get_event_loop())

    coll = ShardedCollection(cluster, args.ns)
    await coll.init()

    num_chunks = await cluster.configDb.chunks.count_documents({'ns': coll.name})
    print(f"""Collection {coll.name} has a shardKeyPattern of {coll.shard_key_pattern} and {num_chunks} chunks.
For optimisation and for dry runs, will assume a chunk size of {args.phase_1_estimated_chunk_size_kb} KB.""")

    if not args.dryrun:
        await cluster.checkIsMongos()
        if not yes_no('The next steps will perform durable changes to the cluster.\n' +
                      'Proceed (yes/no)? '):
            raise KeyboardInterrupt('User canceled')

    ###############################################################################################
    # Sanity checks (Read-Only): Ensure that the balancer and auto-splitter are stopped and that
    # the MaxChunkSize has been configured appropriately
    #
    balancer_doc = await cluster.configDb.settings.find_one({'_id': 'balancer'})
    if not args.dryrun and (balancer_doc is None or balancer_doc['mode'] != 'off'):
        raise Exception("""The balancer must be stopped before running this script. Please run:
           sh.stopBalancer()""")

    auto_splitter_doc = await cluster.configDb.settings.find_one({'_id': 'autosplit'})
    if not args.dryrun and (auto_splitter_doc is None or auto_splitter_doc['enabled']):
        raise Exception(
            """The auto-splitter must be disabled before running this script. Please run:
           db.getSiblingDB('config').settings.update({_id:'autosplit'}, {$set: {enabled: false}}, {upsert: true})"""
        )

    chunk_size_doc = await cluster.configDb.settings.find_one({'_id': 'chunksize'})
    if chunk_size_doc is None or chunk_size_doc['value'] < 128:
        if not args.dryrun:
            raise Exception(
                """The MaxChunkSize must be configured to at least 128 MB before running this script. Please run:
           db.getSiblingDB('config').settings.update({_id:'chunksize'}, {$set: {value: 128}}, {upsert: true})"""
            )
        else:
            # In a dry run, the --dryrun argument doubles as the target chunk size (in KB) to
            # simulate when the cluster does not have one configured
            target_chunk_size_kb = args.dryrun
    else:
        target_chunk_size_kb = chunk_size_doc['value'] * 1024

    if args.dryrun:
        print(f"""Performing a dry run with a target chunk size of {target_chunk_size_kb} KB.
No actual modifications to the cluster will occur.""")
        try:
            await cluster.checkIsMongos()
        except Cluster.NotMongosException:
            print('Not connected to a mongos')

    ###############################################################################################
    # Initialisation (Read-Only): Fetch all chunks in memory and calculate the collection version
    # in preparation for the subsequent write phase.
    ###############################################################################################
    shard_to_chunks = {}
    collectionVersion = None
    with tqdm(total=num_chunks, unit=' chunks') as progress:
        async for c in cluster.configDb.chunks.find({'ns': coll.name},
                                                    sort=[('min', pymongo.ASCENDING)]):
            shardId = c['shard']
            if collectionVersion is None:
                collectionVersion = c['lastmod']
            if c['lastmod'] > collectionVersion:
                collectionVersion = c['lastmod']
            if shardId not in shard_to_chunks:
                shard_to_chunks[shardId] = {'chunks': [], 'num_merges_performed': 0}
            shard = shard_to_chunks[shardId]
            shard['chunks'].append(c)
            progress.update()

    print(f'Collection version is {collectionVersion} and chunks are spread over '
          f'{len(shard_to_chunks)} shards')

    ###############################################################################################
    #
    # WRITE PHASES START FROM HERE ONWARDS
    #
    ###############################################################################################

    ###############################################################################################
    # PHASE 1 (Merge-only): The purpose of this phase is to merge as many chunks as possible
    # without actually moving any data. It is intended to achieve the maximum number of merged
    # chunks with the minimum possible intrusion on the ongoing CRUD workload due to refresh
    # stalls.
    #
    # This phase is also resumable, because for every chunk/chunk range that it processes, it will
    # persist a field called 'defrag_collection_est_size' on the chunk, which estimates its size
    # as of the time the script ran. Resuming Phase 1 will skip over any chunks which already
    # contain this field, because it indicates that a previous execution already performed all the
    # possible merges.
    #
    # These are the parameters that control the operation of this phase; their purpose is
    # explained below:

    max_merges_on_shards_at_less_than_collection_version = 1
    max_merges_on_shards_at_collection_version = 10

    # The way Phase 1 (merge-only) operates is by running:
    #
    #   (1) Up to `max_merges_on_shards_at_less_than_collection_version` concurrent mergeChunks
    #       across all shards which are below the collection major version
    #   AND
    #   (2) Up to `max_merges_on_shards_at_collection_version` concurrent mergeChunks across all
    #       shards which are already on the collection major version
    #
    # Merges due to (1) will bring the respective shard's major version up to that of the
    # collection, which unfortunately is interpreted by the routers as "something routing-related
    # changed" and will result in a refresh and a stall on the critical CRUD path. Because of
    # this, the script runs only one of these at a time by default. On the other hand, merges due
    # to (2) only increment the minor version and will not cause stalls on the CRUD path, so these
    # can run with higher concurrency.
    #
    # The expectation is that at the end of this phase, not all possible defragmentation will have
    # been achieved, but the number of chunks on the cluster will have been significantly reduced,
    # which makes Phase 2 much less invasive in terms of the refreshes caused by moveChunk.
    #
    # For example, in a collection with 1 million chunks, a refresh due to moveChunk could be
    # expected to take up to a second. However, with the number of chunks reduced to 500,000 by
    # Phase 1, the refresh time would be on the order of ~100-200 msec.
    ###############################################################################################
    sem_at_less_than_collection_version = asyncio.Semaphore(
        max_merges_on_shards_at_less_than_collection_version)
    sem_at_collection_version = asyncio.Semaphore(max_merges_on_shards_at_collection_version)

    async def merge_chunks_on_shard(shard, collection_version, progress):
        shard_entry = shard_to_chunks[shard]
        shard_chunks = shard_entry['chunks']
        if len(shard_chunks) == 0:
            return

        chunk_at_shard_version = max(shard_chunks, key=lambda c: c['lastmod'])
        shard_version = chunk_at_shard_version['lastmod']
        shard_is_at_collection_version = shard_version.time == collection_version.time
        progress.write(f'{shard}: {shard_version}: ', end='')
        if shard_is_at_collection_version:
            progress.write('Merge will start without a major version bump')
        else:
            progress.write('Merge will start with a major version bump')

        consecutive_chunks = []
        estimated_size_of_consecutive_chunks = 0

        num_lock_busy_errors_encountered = 0

        for c in shard_chunks:
            progress.update()

            if len(consecutive_chunks) == 0:
                consecutive_chunks = [c]
                estimated_size_of_consecutive_chunks = args.phase_1_estimated_chunk_size_kb
                continue

            merge_consecutive_chunks_without_size_check = False

            if consecutive_chunks[-1]['max'] == c['min']:
                consecutive_chunks.append(c)
                estimated_size_of_consecutive_chunks += args.phase_1_estimated_chunk_size_kb
            elif len(consecutive_chunks) == 1:
                if not args.dryrun and 'defrag_collection_est_size' not in consecutive_chunks[-1]:
                    chunk_range = [consecutive_chunks[-1]['min'], consecutive_chunks[-1]['max']]
                    data_size = await coll.data_size_kb(chunk_range)
                    await coll.try_write_chunk_size(chunk_range, shard, data_size)

                consecutive_chunks = [c]
                estimated_size_of_consecutive_chunks = args.phase_1_estimated_chunk_size_kb
                continue
            else:
                merge_consecutive_chunks_without_size_check = True

            # After we have collected a run of chunks whose estimated size is at least 90% of the
            # maximum chunk size, measure it (via `dataSize` below) in order to determine whether
            # we can merge them or whether we should continue adding more chunks to the run
            if (estimated_size_of_consecutive_chunks <= target_chunk_size_kb * 0.90
                    and not merge_consecutive_chunks_without_size_check):
                continue

            merge_bounds = [consecutive_chunks[0]['min'], consecutive_chunks[-1]['max']]

            # Determine the "exact" (not 100% exact because we use the 'estimate' option) size of
            # the currently accumulated bounds via the `dataSize` command in order to decide
            # whether this run should be merged or whether we should continue adding chunks to it.
            if not args.dryrun:
                actual_size_of_consecutive_chunks = await coll.data_size_kb(merge_bounds)
            else:
                actual_size_of_consecutive_chunks = estimated_size_of_consecutive_chunks

            if merge_consecutive_chunks_without_size_check:
                pass
            elif actual_size_of_consecutive_chunks < target_chunk_size_kb * 0.75:
                # If the actual range size is still 25% less than the target size, continue adding
                # consecutive chunks
                estimated_size_of_consecutive_chunks = actual_size_of_consecutive_chunks
                continue
            elif actual_size_of_consecutive_chunks > target_chunk_size_kb * 1.10:
                # TODO: If the actual range size is 10% more than the target size, use
                # `splitVector` to determine a better merge/split sequence so as not to generate
                # huge chunks which will have to be split later on
                pass

            # Perform the actual merge, obeying the configured concurrency
            async with (sem_at_collection_version if shard_is_at_collection_version else
                        sem_at_less_than_collection_version):
                if not args.dryrun:
                    try:
                        await coll.merge_chunks(merge_bounds)
                        await coll.try_write_chunk_size(merge_bounds, shard,
                                                        actual_size_of_consecutive_chunks)
                    except pymongo_errors.OperationFailure as ex:
                        if ex.details['code'] == 46:  # The code for LockBusy
                            num_lock_busy_errors_encountered += 1
                            if num_lock_busy_errors_encountered == 1:
                                logging.warning(
                                    f"""Lock error occurred while trying to merge chunk range {merge_bounds}.
This indicates the presence of an older MongoDB version.""")
                        else:
                            raise
                else:
                    progress.write(
                        f'Merging {len(consecutive_chunks)} consecutive chunks on {shard}: {merge_bounds}'
                    )

            if merge_consecutive_chunks_without_size_check:
                consecutive_chunks = [c]
                estimated_size_of_consecutive_chunks = args.phase_1_estimated_chunk_size_kb
            else:
                consecutive_chunks = []
                estimated_size_of_consecutive_chunks = 0

            # Any merge leaves the shard at the collection's major version, so subsequent merges
            # on this shard can use the higher-concurrency semaphore
            shard_entry['num_merges_performed'] += 1
            shard_is_at_collection_version = True

    with tqdm(total=num_chunks, unit=' chunks') as progress:
        tasks = []
        for s in shard_to_chunks:
            tasks.append(
                asyncio.ensure_future(merge_chunks_on_shard(s, collectionVersion, progress)))
        await asyncio.gather(*tasks)
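# The `args` namespace consumed by `main` above is produced by the script's command-line parsing,
# which is outside this excerpt. The sketch below shows one plausible entry point, inferred from
# the attributes referenced above (`uri`, `ns`, `dryrun`, `phase_1_estimated_chunk_size_kb`); the
# option names, defaults and help text are assumptions, not the script's actual interface.
import argparse  # would normally live at the top of the script

if __name__ == '__main__':
    argsParser = argparse.ArgumentParser(
        description='Tool to defragment a sharded collection by merging consecutive chunks')
    argsParser.add_argument('uri', help='URI of the mongos to connect to', metavar='uri',
                            type=str)
    argsParser.add_argument('--ns', help='Namespace (database.collection) to defragment',
                            type=str, required=True)
    argsParser.add_argument(
        '--dryrun', help='Only simulate the defragmentation; the optional value is used as the '
        'target chunk size (in KB) if the cluster has no chunksize setting', type=int, nargs='?',
        const=128 * 1024)
    argsParser.add_argument(
        '--phase_1_estimated_chunk_size_kb', help='Assumed size (in KB) of a chunk whose actual '
        'size has not yet been measured', type=int, default=64 * 1024)

    args = argsParser.parse_args()

    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(args))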