    def __cleanup_previous_runs(self):
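        '''Ask the user for confirmation, kill any running mongod/mongos processes and delete the
        introspect and cluster root directories left over from previous runs.'''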
        if (not yes_no(
                'The next step will kill all mongodb processes and wipe out the data path.\n'
                + 'Proceed (yes/no)? ')):
            raise KeyboardInterrupt('User disallowed cleanup of the data path')

        # Iterate through all processes and kill mongod and mongos
        for process in psutil.process_iter():
            try:
                processExecutable = os.path.basename(process.exe())
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass
            else:
                if (processExecutable
                        in [exe_name('mongod'),
                            exe_name('mongos')]):
                    process.kill()
                    process.wait()

        # Remove the output directories
        try:
            shutil.rmtree(self.introspectRoot)
        except FileNotFoundError:
            pass

        try:
            shutil.rmtree(self.clusterRoot)
        except FileNotFoundError:
            pass
    def __cleanup_previous_runs(self):
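        '''Variant of the cleanup above: asks the user for confirmation, kills any running
        mongod/mongos processes and removes the single output directory. Returns False if the user
        declines and True on success, instead of raising.'''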
        if (not yes_no('The next step will kill all mongodb processes and wipe out the data path.\n'
                       + 'Proceed (yes/no)? ')):
            return False

        # Iterate through all processes and kill mongod and mongos
        for process in psutil.process_iter():
            try:
                processExecutable = os.path.basename(process.exe())
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass
            else:
                if (processExecutable in [exe_name('mongod'), exe_name('mongos')]):
                    process.kill()
                    process.wait()

        # Remove the output directory
        shutil.rmtree(self.dir)
        return True
    def __init__(self, config, introspect):
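        '''Start a brand new cluster via mlaunch with the same number of shards as the introspected
        config dump, set its FCV to match the dump, and open a connection to the new config server.'''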
        self._config = config
        self._introspect = introspect

        if config.numShards is None:
            numShards = introspect.configDb.shards.count_documents({})
        else:
            numShards = config.numShards

        if (numShards > 10):
            if (not yes_no(
                    'The imported configuration data contains a large number of shards ('
                    + str(numShards) +
                    '). Proceeding will start a large number of mongod processes.\n'
                    + 'Are you sure you want to continue (yes/no)? ')):
                raise KeyboardInterrupt('Too many shards will be created')

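        # Start the new cluster: `numShards` single-node replica set shards, a CSRS config server
        # and one mongos, with a small WiredTiger cache and oplog to keep the footprint low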
        config.mlaunch_action('init', config.clusterRoot, [
            '--sharded',
            str(numShards), '--replicaset', '--nodes', '1', '--csrs',
            '--mongos', '1', '--port',
            str(config.clusterStartingPort), '--wiredTigerCacheSizeGB', '0.25',
            '--oplogSize', '50'
        ])

        # Set the correct FCV on the cluster being reconstructed
        clusterConnection = MongoClient('localhost',
                                        config.clusterStartingPort)
        self._config.log_line(
            clusterConnection.admin.command('setFeatureCompatibilityVersion',
                                            introspect.FCV))

        # TODO: Find a better way to determine the port of the config server's primary
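        # (assumes mlaunch's default port layout: the mongos gets the starting port, the next
        # `numShards` ports go to the single-node shards, and the CSRS node comes right after)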
        self.configServerPort = config.clusterStartingPort + (numShards + 1)

        configServerConnection = MongoClient('localhost',
                                             self.configServerPort)
        self.configDb = configServerConnection.config
def main():
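    '''Reconstruct a sharded cluster from a dump of its config database: start a new cluster with
    mlaunch, restore the dumped config database onto the new CSRS, rename the dumped shards to the
    newly launched ones throughout the metadata, and recreate the shard key indexes for every
    sharded collection on every shard.'''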
    argsParser = argparse.ArgumentParser(
        description=
        'Tool to interpret an export of a cluster config database and construct a new cluster with '
        'exactly the same configuration. Requires mlaunch to be installed and in the system path.')
    argsParser.add_argument('--binarypath', help='Directory containing the MongoDB binaries',
                            metavar='binarypath', type=str, required=True)
    argsParser.add_argument(
        '--dir', help='Directory in which to place the data files (will create subdirectories)',
        metavar='dir', type=str, required=True)
    argsParser.add_argument('configdumpdir',
                            help='Directory containing a dump of the cluster config database',
                            metavar='configdumpdir', type=str, nargs=1)

    config = ToolConfiguration(argsParser.parse_args())

    # Read the cluster configuration from the preprocess instance and construct the new cluster
    introspect = ClusterIntrospect(config)

    numShards = introspect.configDb.shards.count_documents({})
    if (numShards > 10):
        if (not yes_no('The imported configuration data contains a large number of shards (' + str(
                numShards) + '). Proceeding will start a large number of mongod processes.\n' +
                       'Are you sure you want to continue (yes/no)? ')):
            return 1

    # Make the output directories
    mongodClusterRootPath = os.path.join(config.dir, 'cluster_root')
    os.makedirs(mongodClusterRootPath)

    config.mlaunch_action('init', mongodClusterRootPath, [
        '--replicaset', '--nodes', '1', '--sharded',
        str(numShards), '--csrs', '--mongos', '1', '--port',
        str(config.clusterStartingPort)
    ])

    configServerPort = config.clusterStartingPort + numShards + 1
    config.restore_config_db_to_port(configServerPort)

    configServerConnection = MongoClient('localhost', configServerPort)
    configServerConfigDB = configServerConnection.config

    # Rename the shards from the dump to the shards launched by mlaunch in the shards collection
    print('Renaming shards in the shards collection:')

    # Shards that came with the config dump
    SHARDS_FROM_DUMP = list(introspect.configDb.shards.find({}).sort('_id', 1))

    # Shards that mlaunch generated (should be the same size as SHARDS_FROM_DUMP)
    SHARDS_FROM_MLAUNCH = list(
        configServerConfigDB.shards.find({
            '_id': {
                '$not': {
                    '$in': list(map(lambda x: x['_id'], SHARDS_FROM_DUMP))
                }
            }
        }).sort('_id', 1))

    assert (len(SHARDS_FROM_DUMP) == len(SHARDS_FROM_MLAUNCH))

    shardsToInsert = []
    for shardFromDump, shardFromMlaunch in zip(deepcopy(SHARDS_FROM_DUMP), SHARDS_FROM_MLAUNCH):
        shardFromDump['_id'] = shardFromMlaunch['_id']
        shardFromDump['host'] = shardFromMlaunch['host']
        shardsToInsert.append(shardFromDump)

    # Wipe out all the shards (both from the dump and from mlaunch)
    result = configServerConfigDB.shards.delete_many({})
    config.log_line(result.raw_result)

    # Patch the _id and host of the shards in the config dump
    for shardToInsert in shardsToInsert:
        result = configServerConfigDB.shards.insert_one(shardToInsert)
        config.log_line(result.inserted_id)

    # Rename the shards from the dump to the shards launched by mlaunch in the metadata
    print('Renaming shards in the routing metadata:')
    for shardIdFromDump, shardIdFromMlaunch in zip(
            list(map(lambda x: x['_id'], SHARDS_FROM_DUMP)),
            list(map(lambda x: x['_id'], SHARDS_FROM_MLAUNCH))):
        print('Shard ' + shardIdFromDump + ' becomes ' + shardIdFromMlaunch)

        result = configServerConfigDB.databases.update_many({
            'primary': shardIdFromDump
        }, {'$set': {
            'primary': shardIdFromMlaunch
        }})
        config.log_line(result.raw_result)

        # Rename the shards in the chunks' current owner field
        result = configServerConfigDB.chunks.update_many({
            'shard': shardIdFromDump
        }, {'$set': {
            'shard': shardIdFromMlaunch
        }})
        config.log_line(result.raw_result)

        # Rename the shards in the chunks' history
        result = configServerConfigDB.chunks.update_many({
            'history.shard': shardIdFromDump
        }, {'$set': {
            'history.$[].shard': shardIdFromMlaunch
        }})
        config.log_line(result.raw_result)

    # Create the collections and construct sharded indexes on all shard nodes
    for shard in configServerConfigDB.shards.find({}):
        print('Creating shard key indexes on shard ' + shard['_id'])

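        # shard['host'] has the form '<replSetName>/<host1:port>,<host2:port>,...'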
        shardConnParts = shard['host'].split('/', 1)
        shardConnection = MongoClient(shardConnParts[1], replicaset=shardConnParts[0])

        for collection in configServerConfigDB.collections.find({'dropped': False}):
            collectionParts = collection['_id'].split('.', 1)
            dbName = collectionParts[0]
            collName = collectionParts[1]
            collUUID = collection['uuid'] if 'uuid' in collection else None

            shardKey = collection['key']

            db = shardConnection.get_database(dbName)

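            # Create the collection through an applyOps 'c' (command) oplog entry so that it can be
            # assigned the same UUID that the config dump records for it (a plain 'create' command
            # does not accept an explicit UUID)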
            applyOpsCommand = {
                'applyOps': [{
                    'op': 'c',
                    'ns': dbName + '.$cmd',
                    'o': {
                        'create': collName,
                    },
                }]
            }

            if collUUID:
                applyOpsCommand['applyOps'][0]['ui'] = collUUID

            config.log_line("db.adminCommand(" + str(applyOpsCommand) + ");")
            db.command(applyOpsCommand, codec_options=CodecOptions(uuid_representation=4))

            createIndexesCommand = {
                'createIndexes': collName,
                'indexes': [{
                    'key': shardKey,
                    'name': 'Shard key index'
                }]
            }
            config.log_line(
                "db.getSiblingDB(" + dbName + ").runCommand(" + str(createIndexesCommand) + ");")
            db.command(createIndexesCommand)

        shardConnection.close()

    # Restart the cluster so it picks up the new configuration cleanly
    config.mlaunch_action('restart', mongodClusterRootPath)

    return 0
async def main(args):
    cluster = Cluster(args.uri, asyncio.get_event_loop())
    coll = ShardedCollection(cluster, args.ns)
    await coll.init()

    num_chunks = await cluster.configDb.chunks.count_documents(
        {'ns': coll.name})
    print(
        f"""Collection {coll.name} has a shardKeyPattern of {coll.shard_key_pattern} and {num_chunks} chunks.
            For optimisation and for dry runs, a chunk size of {args.phase_1_estimated_chunk_size_kb}KB will be assumed."""
    )

    if not args.dryrun:
        await cluster.checkIsMongos()
        if not yes_no(
                'The next steps will perform durable changes to the cluster.\n'
                + 'Proceed (yes/no)? '):
            raise KeyboardInterrupt('User canceled')

    ###############################################################################################
    # Sanity checks (Read-Only): Ensure that the balancer and auto-splitter are stopped and that the
    # MaxChunkSize has been configured appropriately
    #
    balancer_doc = await cluster.configDb.settings.find_one(
        {'_id': 'balancer'})
    if not args.dryrun and (balancer_doc is None
                            or balancer_doc['mode'] != 'off'):
        raise Exception(
            """The balancer must be stopped before running this script. Please run:
                           sh.stopBalancer()""")

    auto_splitter_doc = await cluster.configDb.settings.find_one(
        {'_id': 'autosplit'})
    if not args.dryrun and (auto_splitter_doc is None
                            or auto_splitter_doc['enabled']):
        raise Exception(
            """The auto-splitter must be disabled before running this script. Please run:
               db.getSiblingDB('config').settings.update({_id:'autosplit'}, {$set: {enabled: false}}, {upsert: true})"""
        )

    chunk_size_doc = await cluster.configDb.settings.find_one(
        {'_id': 'chunksize'})
    if chunk_size_doc is None or chunk_size_doc['value'] < 128:
        if not args.dryrun:
            raise Exception(
                """The MaxChunkSize must be configured to at least 128 MB before running this script. Please run:
                   db.getSiblingDB('config').settings.update({_id:'chunksize'}, {$set: {value: 128}}, {upsert: true})"""
            )
        else:
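            # In a dry run the value passed via --dryrun is apparently reused as the target chunk
            # size in KB, since no 'chunksize' setting needs to exist on the cluster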
            target_chunk_size_kb = args.dryrun
    else:
        target_chunk_size_kb = chunk_size_doc['value'] * 1024

    if args.dryrun:
        print(
            f"""Performing a dry run with target chunk size of {target_chunk_size_kb}.
                  No actual modifications to the cluster will occur.""")
        try:
            await cluster.checkIsMongos()
        except Cluster.NotMongosException:
            print('Not connected to a mongos')

    ###############################################################################################
    # Initialisation (Read-Only): Fetch all chunks in memory and calculate the collection version
    # in preparation for the subsequent write phase.
    ###############################################################################################

    shard_to_chunks = {}
    collectionVersion = None

    with tqdm(total=num_chunks, unit=' chunks') as progress:
        async for c in cluster.configDb.chunks.find({'ns': coll.name},
                                                    sort=[('min',
                                                           pymongo.ASCENDING)
                                                          ]):
            shardId = c['shard']
            if collectionVersion is None:
                collectionVersion = c['lastmod']
            if c['lastmod'] > collectionVersion:
                collectionVersion = c['lastmod']
            if shardId not in shard_to_chunks:
                shard_to_chunks[shardId] = {
                    'chunks': [],
                    'num_merges_performed': 0
                }
            shard = shard_to_chunks[shardId]
            shard['chunks'].append(c)
            progress.update()

    print(
        f'Collection version is {collectionVersion} and chunks are spread over {len(shard_to_chunks)} shards'
    )

    ###############################################################################################
    #
    # WRITE PHASES START FROM HERE ONWARDS
    #
    ###############################################################################################

    ###############################################################################################
    # PHASE 1 (Merge-only): The purpose of this phase is to merge as many chunks as possible without
    # actually moving any data. It is intended to achieve the maximum number of merged chunks with
    # the minimum possible intrusion to the ongoing CRUD workload due to refresh stalls.
    #
    # The stage is also resumable, because for every chunk/chunk range that it processes, it will
    # persist a field called 'defrag_collection_est_size' on the chunk, which estimates its size as
    # of the time the script ran. Resuming Phase 1 will skip over any chunks which already contain
    # this field, because it indicates that previous execution already ran and performed all the
    # possible merges.
    #
    # These are the parameters that control the operation of this phase and their purpose is
    # explained below:

    max_merges_on_shards_at_less_than_collection_version = 1
    max_merges_on_shards_at_collection_version = 10

    # The way Phase 1 (merge-only) operates is by running:
    #
    #   (1) Up to `max_merges_on_shards_at_less_than_collection_version` concurrent mergeChunks
    #       across all shards which are below the collection major version
    #           AND
    #   (2) Up to `max_merges_on_shards_at_collection_version` concurrent mergeChunks across all
    #       shards which are already on the collection major version
    #
    # Merges due to (1) will bring the respective shard's major version to that of the collection,
    # which unfortunately is interpreted by the routers as "something routing-related changed" and
    # will result in refresh and a stall on the critical CRUD path. Because of this, the script only
    # runs one at a time of these by default. On the other hand, merges due to (2) only increment
    # the minor version and will not cause stalls on the CRUD path, so these can run with higher
    # concurrency.
    #
    # The expectation is that at the end of this phase, not all possible defragmentation would have
    # been achieved, but the number of chunks on the cluster would have been significantly reduced
    # in a way that would make Phase 2 much less invasive due to refreshes after moveChunk.
    #
    # For example in a collection with 1 million chunks, a refresh due to moveChunk could be
    # expected to take up to a second. However with the number of chunks reduced to 500,000 due to
    # Phase 1, the refresh time would be on the order of ~100-200msec.
    ###############################################################################################

    sem_at_less_than_collection_version = asyncio.Semaphore(
        max_merges_on_shards_at_less_than_collection_version)
    sem_at_collection_version = asyncio.Semaphore(
        max_merges_on_shards_at_collection_version)

    async def merge_chunks_on_shard(shard, collection_version, progress):
        shard_entry = shard_to_chunks[shard]
        shard_chunks = shard_entry['chunks']
        if len(shard_chunks) == 0:
            return

        chunk_at_shard_version = max(shard_chunks, key=lambda c: c['lastmod'])
        shard_version = chunk_at_shard_version['lastmod']
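        # 'lastmod' is a BSON Timestamp encoding the chunk version: 'time' is the major version and
        # 'inc' is the minor version, so comparing 'time' tells us whether this shard is already at
        # the collection's major version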
        shard_is_at_collection_version = shard_version.time == collection_version.time
        progress.write(f'{shard}: {shard_version}: ', end='')
        if shard_is_at_collection_version:
            progress.write('Merge will start without major version bump')
        else:
            progress.write('Merge will start with a major version bump')

        consecutive_chunks = []
        estimated_size_of_consecutive_chunks = 0

        num_lock_busy_errors_encountered = 0

        for c in shard_chunks:
            progress.update()

            if len(consecutive_chunks) == 0:
                consecutive_chunks = [c]
                estimated_size_of_consecutive_chunks = args.phase_1_estimated_chunk_size_kb
                continue

            merge_consecutive_chunks_without_size_check = False

            if consecutive_chunks[-1]['max'] == c['min']:
                consecutive_chunks.append(c)
                estimated_size_of_consecutive_chunks += args.phase_1_estimated_chunk_size_kb
            elif len(consecutive_chunks) == 1:
                if not args.dryrun and 'defrag_collection_est_size' not in consecutive_chunks[-1]:
                    chunk_range = [
                        consecutive_chunks[-1]['min'],
                        consecutive_chunks[-1]['max']
                    ]
                    data_size = await coll.data_size_kb(chunk_range)
                    await coll.try_write_chunk_size(chunk_range, shard,
                                                    data_size)

                consecutive_chunks = [c]
                estimated_size_of_consecutive_chunks = args.phase_1_estimated_chunk_size_kb
                continue
            else:
                merge_consecutive_chunks_without_size_check = True

            # Once the estimated size of the accumulated run of chunks reaches 90% of the target
            # chunk size (or this run must be closed regardless), check its actual size in order to
            # decide whether to merge it now or to keep adding chunks to it
            if (estimated_size_of_consecutive_chunks <= target_chunk_size_kb *
                    0.90) and not merge_consecutive_chunks_without_size_check:
                continue

            merge_bounds = [
                consecutive_chunks[0]['min'], consecutive_chunks[-1]['max']
            ]

            # Determine the "exact" (not 100% exact because we use the 'estimate' option) size of
            # the currently accumulated bounds via the `dataSize` command in order to decide
            # whether this run should be merged or if we should continue adding chunks to it.
            if not args.dryrun:
                actual_size_of_consecutive_chunks = await coll.data_size_kb(merge_bounds)
            else:
                actual_size_of_consecutive_chunks = estimated_size_of_consecutive_chunks

            if merge_consecutive_chunks_without_size_check:
                pass
            elif actual_size_of_consecutive_chunks < target_chunk_size_kb * 0.75:
                # If the actual range size is still 25% below the target size, continue adding
                # consecutive chunks
                estimated_size_of_consecutive_chunks = actual_size_of_consecutive_chunks
                continue
            elif actual_size_of_consecutive_chunks > target_chunk_size_kb * 1.10:
                # TODO: If the actual range size is 10% more than the target size, use `splitVector`
                # to determine a better merge/split sequence so as not to generate huge chunks which
                # will have to be split later on
                pass

            # Perform the actual merge, obeying the configured concurrency
            async with (sem_at_collection_version
                        if shard_is_at_collection_version else
                        sem_at_less_than_collection_version):
                if not args.dryrun:
                    try:
                        await coll.merge_chunks(merge_bounds)
                        await coll.try_write_chunk_size(
                            merge_bounds, shard,
                            actual_size_of_consecutive_chunks)
                    except pymongo_errors.OperationFailure as ex:
                        if ex.details['code'] == 46:  # The code for LockBusy
                            num_lock_busy_errors_encountered += 1
                            if num_lock_busy_errors_encountered == 1:
                                logging.warning(
                                    f"""Lock error occurred while trying to merge chunk range {merge_bounds}.
                                        This indicates the presence of an older MongoDB version."""
                                )
                        else:
                            raise
                else:
                    progress.write(
                        f'Merging {len(consecutive_chunks)} consecutive chunks on {shard}: {merge_bounds}'
                    )

            if merge_consecutive_chunks_without_size_check:
                consecutive_chunks = [c]
                estimated_size_of_consecutive_chunks = args.phase_1_estimated_chunk_size_kb
            else:
                consecutive_chunks = []
                estimated_size_of_consecutive_chunks = 0

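            # Bookkeeping: after the first successful merge the shard has been bumped to the
            # collection's major version, so subsequent merges on it can use the higher-concurrency
            # semaphore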
            shard_entry['num_merges_performed'] += 1
            shard_is_at_collection_version = True

    with tqdm(total=num_chunks, unit=' chunks') as progress:
        tasks = []
        for s in shard_to_chunks:
            tasks.append(
                asyncio.ensure_future(
                    merge_chunks_on_shard(s, collectionVersion, progress)))
        await asyncio.gather(*tasks)