Example #1
0
def _do_copy(collection_name, shard_key):
    """Copy all documents for *shard_key* from the shard's current location
    to its new location.

    The shard must be in the MIGRATING_COPY state. Inserts are unacknowledged
    (safe=False) for speed; a single getLastError call at the end surfaces
    any failure on the target.

    :param collection_name: The collection the shard belongs to
    :param shard_key: The shard key identifying the data to copy
    :raises Exception: If the shard is not in the copy state or the copy fails
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shards_coll = api._get_shards_coll()
    shard_metadata, = shards_coll.find({
        'realm': realm['name'],
        'shard_key': shard_key
    })
    if shard_metadata['status'] != metadata.ShardStatus.MIGRATING_COPY:
        raise Exception('Shard not in copy state (phase 1)')

    current_location = shard_metadata['location']
    new_location = shard_metadata['new_location']

    current_collection = _get_collection_from_location_string(
        current_location, collection_name)

    new_collection = _get_collection_from_location_string(
        new_location, collection_name)

    query = {shard_field: shard_key}
    for record in current_collection.find(query):
        new_collection.insert(record, safe=False)

    # getLastError reports ok: 1 even when the last write failed; the actual
    # failure is carried in the "err" field, so check that instead of "ok".
    result = new_collection.database.command('getLastError')
    if result['err']:
        raise Exception('Failed to do copy! Mongo error: %s' % result['err'])
Example #2
0
    def test_caching(self, mock_get_realm_coll):
        """Realm metadata lookups are cached: repeated calls within the cache
        window are served from the cache, and the cache expires after
        self._cache_length seconds.
        """
        shard_data = {'shard_field': 'domain'}
        mock_get_realm_coll.return_value.find.return_value = [shard_data]

        # First lookup must hit the (mocked) realm collection.
        result = metadata._get_realm_for_collection('bob')
        self.assertEquals(shard_data, result)
        self.assertEquals(1, mock_get_realm_coll.call_count)

        # Second lookup within the cache window comes from the cache, so the
        # call count does not increase.
        result = metadata._get_realm_for_collection('bob')
        self.assertEquals(shard_data, result)
        self.assertEquals(1, mock_get_realm_coll.call_count)

        # After the cache has expired, the collection is queried again.
        time.sleep(self._cache_length * 2)
        result = metadata._get_realm_for_collection('bob')
        self.assertEquals(shard_data, result)
        self.assertEquals(2, mock_get_realm_coll.call_count)
Example #3
0
def multishard_insert(
        collection_name, doc_or_docs, with_options=None, *args, **kwargs):
    """Insert one document (or a list of documents) into whichever shard each
    document's shard-field value targets.

    :param collection_name: The sharded collection to insert into
    :param doc_or_docs: A single document dict or a list of them
    :param with_options: Optional options applied to the target collection(s)
    :returns: The single insert result, or a list of results for a
        multi-document insert
    :raises Exception: If any document lacks the realm's shard field
    """
    # TODO Remove this and use insert_one/insert_many to comply with new
    # pymongo deprecations
    # None default avoids the shared-mutable-default-argument pitfall;
    # normalise to {} to preserve the original behaviour.
    if with_options is None:
        with_options = {}
    is_multi_insert = isinstance(doc_or_docs, list)
    if not is_multi_insert:
        all_docs = [doc_or_docs]
    else:
        all_docs = doc_or_docs

    _wait_for_pause_to_end(collection_name, doc_or_docs)
    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']
    for doc in all_docs:
        if shard_field not in doc:
            raise Exception(
                'Cannot insert document without shard field (%s) present'
                % shard_field)

    # Inserts can use our generic collection iterator with a specific query
    # that is guaranteed to return exactly one collection.
    # TODO This makes a multi-insert into lots of small inserts. This could be
    # optimised. For now, we'll see if this is OK.
    result = []
    for doc in all_docs:
        simple_query = {shard_field: doc[shard_field]}
        (collection, _), = _create_collection_iterator(
            collection_name, simple_query, with_options)
        result.append(collection.insert(doc, *args, **kwargs))
    if not is_multi_insert:
        return result[0]
    return result
Example #4
0
    def test_caching(self, mock_get_realm_coll):
        """Realm metadata lookups are cached: repeated calls within the cache
        window are served from the cache, and the cache expires after
        self._cache_length seconds.
        """
        shard_data = {'shard_field': 'domain'}
        mock_get_realm_coll.return_value.find.return_value = [shard_data]

        # First lookup must hit the (mocked) realm collection.
        result = metadata._get_realm_for_collection('bob')
        self.assertEquals(shard_data, result)
        self.assertEquals(1, mock_get_realm_coll.call_count)

        # Second lookup within the cache window comes from the cache, so the
        # call count does not increase.
        result = metadata._get_realm_for_collection('bob')
        self.assertEquals(shard_data, result)
        self.assertEquals(1, mock_get_realm_coll.call_count)

        # After the cache has expired, the collection is queried again.
        time.sleep(self._cache_length * 2)
        result = metadata._get_realm_for_collection('bob')
        self.assertEquals(shard_data, result)
        self.assertEquals(2, mock_get_realm_coll.call_count)
Example #5
0
def _do_copy(collection_name, shard_key, manager):
    """Copy all documents for *shard_key* from the shard's current location
    to its new location, in throttled batches of upserts.

    The shard must be in the MIGRATING_COPY state.

    :param collection_name: The collection the shard belongs to
    :param shard_key: The shard key identifying the data to copy
    :param manager: Migration manager providing insert_throttle and
        insert_batch_size (both may be changed by other threads) and the
        inserted-records counter
    :raises Exception: If the shard is not in the copy state
    """
    realm = metadata._get_realm_for_collection(collection_name)

    shard_metadata, = api._get_shards_coll().find({
        'realm': realm['name'],
        'shard_key': shard_key
    })
    if shard_metadata['status'] != metadata.ShardStatus.MIGRATING_COPY:
        raise Exception('Shard not in copy state (phase 1)')

    current_collection = _get_collection_from_location_string(
        shard_metadata['location'], collection_name)
    new_collection = _get_collection_from_location_string(
        shard_metadata['new_location'], collection_name)
    # Upsert against the target's mongos shard key when it has one, otherwise
    # fall back to upserting on _id.
    target_key = sniff_mongos_shard_key(new_collection) or ['_id']

    # no_cursor_timeout: a large copy can outlive the server-side cursor
    # idle timeout.
    cursor = current_collection.find({realm['shard_field']: shard_key},
                                     no_cursor_timeout=True)
    try:
        # manager.insert_throttle and manager.insert_batch_size can change
        # in other thread so we reference them on each cycle
        for batch in batched_cursor_iterator(
                cursor, lambda: manager.insert_batch_size):
            try:
                result = new_collection.bulk_write(batch_of_upsert_ops(
                    batch, target_key),
                                                   ordered=True)
            except BulkWriteError as e:
                # Log the full error details before propagating.
                pretty_log(e.details)
                raise
            tum_ti_tum(manager.insert_throttle)
            manager.inc_inserted(by=result.bulk_api_result['nUpserted'])
    finally:
        # no_cursor_timeout cursors must be closed explicitly.
        cursor.close()
Example #6
0
def _delete_source_data(collection_name, shard_key, delete_throttle=None):
    """Remove all documents for *shard_key* from the shard's current (source)
    location after a migration, one document at a time.

    The shard must be in the POST_MIGRATION_DELETE state. Progress is logged
    every 10000 deletions and an optional sleep throttles the delete rate.
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shard_metadata, = api._get_shards_coll().find(
        {'realm': realm['name'], 'shard_key': shard_key})
    if shard_metadata['status'] != metadata.ShardStatus.POST_MIGRATION_DELETE:
        raise Exception('Shard not in delete state')

    source_collection = _get_collection_from_location_string(
        shard_metadata['location'], collection_name)

    # Only the _id is needed for the per-document remove below.
    cursor = source_collection.find(
            {shard_field: shard_key}, {'_id': 1},
            no_cursor_timeout=True)
    try:
        for deleted, doc in enumerate(cursor):
            source_collection.remove({'_id': doc['_id']})
            if delete_throttle:
                time.sleep(delete_throttle)
            if deleted % 10000 == 0:
                _detail_log('%d records deleted' % deleted)
    finally:
        cursor.close()
Example #7
0
def _delete_source_data(collection_name, shard_key, manager):
    """Remove all documents for *shard_key* from the shard's current (source)
    location after a migration, one document at a time.

    The shard must be in the POST_MIGRATION_DELETE state.

    :param collection_name: The collection the shard belongs to
    :param shard_key: The shard key identifying the data to delete
    :param manager: Migration manager providing delete_throttle (which may be
        changed by another thread) and the deleted-records counter
    :raises Exception: If the shard is not in the delete state
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shards_coll = api._get_shards_coll()
    shard_metadata, = shards_coll.find(
        {'realm': realm['name'], 'shard_key': shard_key})
    if shard_metadata['status'] != metadata.ShardStatus.POST_MIGRATION_DELETE:
        raise Exception('Shard not in delete state')

    current_location = shard_metadata['location']
    current_collection = _get_collection_from_location_string(
        current_location, collection_name)

    # Project only _id (all the remove below needs); no_cursor_timeout because
    # a large delete can outlive the server-side cursor idle timeout.
    cursor = current_collection.find(
            {shard_field: shard_key}, {'_id': 1},
            no_cursor_timeout=True)
    try:
        for doc in cursor:
            current_collection.remove({'_id': doc['_id']})

            # Get the delete throttle out of the manager. This allows for the
            # insert throttle to be changed by another thread whilst maintaining
            # thread safety.
            delete_throttle = manager.delete_throttle
            if delete_throttle:
                time.sleep(delete_throttle)
            manager.inc_deleted()

    finally:
        # no_cursor_timeout cursors must be closed explicitly.
        cursor.close()
Example #8
0
def _delete_source_data(collection_name, shard_key, manager):
    """Remove all documents for *shard_key* from the shard's current (source)
    location after a migration, deleting in batches.

    The shard must be in the POST_MIGRATION_DELETE state.

    :param collection_name: The collection the shard belongs to
    :param shard_key: The shard key identifying the data to delete
    :param manager: Migration manager providing delete_throttle and
        delete_batch_size (both may be changed by other threads) and the
        deleted-records counter
    :raises Exception: If the shard is not in the delete state
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shards_coll = api._get_shards_coll()
    shard_metadata, = shards_coll.find({
        'realm': realm['name'],
        'shard_key': shard_key
    })
    if shard_metadata['status'] != metadata.ShardStatus.POST_MIGRATION_DELETE:
        raise Exception('Shard not in delete state')

    current_location = shard_metadata['location']
    current_collection = _get_collection_from_location_string(
        current_location, collection_name)

    # Project only _id; no_cursor_timeout because a large delete can outlive
    # the server-side cursor idle timeout.
    cursor = current_collection.find({shard_field: shard_key}, {'_id': 1},
                                     no_cursor_timeout=True)
    try:
        # manager.insert_throttle and manager.insert_batch_size can change
        # in other thread so we reference them on each cycle
        for batch in batched_cursor_iterator(
                cursor, lambda: manager.delete_batch_size):
            _ids = [record['_id'] for record in batch]
            result = current_collection.delete_many({'_id': {'$in': _ids}})
            tum_ti_tum(manager.delete_throttle)
            manager.inc_deleted(by=result.raw_result['n'])
    finally:
        # no_cursor_timeout cursors must be closed explicitly.
        cursor.close()
Example #9
0
def _create_collection_iterator(collection_name, query):
    """Creates an iterator that returns collections and queries that can then
    be used to perform multishard operations:

        for collection, query in _create_collection_iterator(...):
            for doc in collection.find(query):
                yield doc

    This does all the hardwork of figuring out what collections to query and how
    to adjust the query to account for any shards that are currently moving.
    """
    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    # If the query pins down a single shard key we only need that shard's
    # location; otherwise every location in the realm must be visited.
    shard_key = _get_query_target(collection_name, query)
    if shard_key:
        location = _get_location_for_shard(realm, shard_key)
        locations = {location.location: location}
    else:
        locations = _get_all_locations_for_realm(realm)

    for location, location_meta in locations.iteritems():
        cluster_name, database_name = location.split('/')
        connection = get_connection(cluster_name)
        collection = connection[database_name][collection_name]
        if location_meta.excludes:
            # A shard mid-migration still has (stale) data here: wrap the
            # query in $and with a $ne so that data is not returned twice.
            if len(location_meta.excludes) == 1:
                query = {'$and': [
                    query, {shard_field: {'$ne': location_meta.excludes[0]}}]}
            else:
                raise Exception('Multiple shards in transit. Aborting')
        yield collection, query
        if location_meta.excludes:
            # Unwrap back to the original query before the next location.
            query = query['$and'][0]
Example #10
0
def multishard_aggregate(collection_name,
                         pipeline,
                         with_options=None,
                         *args,
                         **kwargs):
    """Run an aggregation pipeline against the single cluster targetted by
    the pipeline's leading $match stage.

    The first pipeline stage must be a $match containing the realm's shard
    field so the aggregation resolves to exactly one collection.

    :param collection_name: The sharded collection to aggregate over
    :param pipeline: The aggregation pipeline (list of stages)
    :param with_options: Optional options applied to the target collection
    :raises Exception: If the first stage is not a $match or lacks the shard
        field
    """
    # None default avoids the shared-mutable-default-argument pitfall;
    # normalise to {} to preserve the original behaviour.
    if with_options is None:
        with_options = {}
    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']
    if '$match' not in pipeline[0]:
        raise Exception(
            'Sharded aggregation needs match in the first part of the pipeline'
        )
    if shard_field not in pipeline[0]['$match']:
        raise Exception(
            'Cannot perform aggregation without shard field (%s) present' %
            shard_field)

    # To avoid aggregation needing to be recreated in this client we limit
    # aggregation to only one cluster.
    match_query = pipeline[0]['$match']
    (collection, _, _), = _create_collection_iterator(collection_name,
                                                      match_query,
                                                      with_options)

    # TODO: useCursor needs to be False until support for Mongo2.4 is removed
    return collection.aggregate(pipeline, useCursor=False, *args, **kwargs)
Example #11
0
def _do_copy(collection_name, shard_key):
    """Copy all documents for *shard_key* from the shard's current location
    to its new location.

    The shard must be in the MIGRATING_COPY state. Inserts are unacknowledged
    (safe=False) for speed; a single getLastError call at the end surfaces
    any failure on the target.

    :param collection_name: The collection the shard belongs to
    :param shard_key: The shard key identifying the data to copy
    :raises Exception: If the shard is not in the copy state or the copy fails
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shards_coll = api._get_shards_coll()
    shard_metadata, = shards_coll.find(
        {'realm': realm['name'], 'shard_key': shard_key})
    if shard_metadata['status'] != metadata.ShardStatus.MIGRATING_COPY:
        raise Exception('Shard not in copy state (phase 1)')

    current_location = shard_metadata['location']
    new_location = shard_metadata['new_location']

    current_collection = _get_collection_from_location_string(
        current_location, collection_name)

    new_collection = _get_collection_from_location_string(
        new_location, collection_name)

    query = {shard_field: shard_key}
    for record in current_collection.find(query):
        new_collection.insert(record, safe=False)

    # getLastError reports ok: 1 even when the last write failed; the actual
    # failure is carried in the "err" field, so check that instead of "ok".
    result = new_collection.database.command('getLastError')
    if result['err']:
        raise Exception('Failed to do copy! Mongo error: %s' % result['err'])
Example #12
0
def _delete_source_data(collection_name, shard_key, delete_throttle=None):
    """Remove all documents for *shard_key* from the shard's current (source)
    location after a migration, one document at a time.

    The shard must be in the POST_MIGRATION_DELETE state. Progress is logged
    every 10000 deletions and an optional sleep throttles the delete rate.
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shard_metadata, = api._get_shards_coll().find({
        'realm': realm['name'],
        'shard_key': shard_key
    })
    if shard_metadata['status'] != metadata.ShardStatus.POST_MIGRATION_DELETE:
        raise Exception('Shard not in delete state')

    source_collection = _get_collection_from_location_string(
        shard_metadata['location'], collection_name)

    # Only the _id is needed for the per-document remove below.
    cursor = source_collection.find({shard_field: shard_key}, {'_id': 1},
                                    no_cursor_timeout=True)
    try:
        for deleted, doc in enumerate(cursor):
            source_collection.remove({'_id': doc['_id']})
            if delete_throttle:
                time.sleep(delete_throttle)
            if deleted % 10000 == 0:
                _detail_log('%d records deleted' % deleted)
    finally:
        cursor.close()
Example #13
0
def _get_query_target(collection_name, query):
    """Return the shard key that *query* explicitly targets, or None when the
    query does not pin down a single shard.
    """
    shard_field = _get_realm_for_collection(collection_name)['shard_field']
    if shard_field not in query:
        return None
    candidate = query[shard_field]
    if _is_valid_type_for_sharding(candidate):
        return candidate
    return None
Example #14
0
def where_is(collection_name, shard_key):
    """Locate the shard of data identified by *shard_key*.

    :param collection_name: The collection name for the shard
    :param shard_key: The shard key to look for
    :returns: A "cluster/database" location string
    """
    realm = _get_realm_for_collection(collection_name)
    return _get_location_for_shard(realm, shard_key).location
Example #15
0
def where_is(collection_name, shard_key):
    """Locate the shard of data identified by *shard_key*.

    :param collection_name: The collection name for the shard
    :param shard_key: The shard key to look for
    :returns: A "cluster/database" location string
    """
    realm = _get_realm_for_collection(collection_name)
    return _get_location_for_shard(realm, shard_key).location
Example #16
0
    def test_ensure_realm_exists(self):
        """ensure_realm_exists creates a realm once and is idempotent on
        repeated calls."""
        # Trying to get a none-existent realm should blow up
        with self.assertRaises(Exception) as catcher:
            realm = _get_realm_for_collection('some_collection')
        self.assertEquals(
            catcher.exception.message,
            'Realm for collection some_collection does not exist')

        # Creating the realm makes the lookup succeed.
        ensure_realm_exists('some_realm', 'some_field', 'some_collection')
        realm = _get_realm_for_collection('some_collection')
        self.assertEquals('some_realm', realm['name'])

        # Try creating the realm again, ensure it doesn't blow up or create a
        # duplicate
        ensure_realm_exists('some_realm', 'some_field', 'some_collection')
        realm = _get_realm_for_collection('some_collection')
        self.assertEquals('some_realm', realm['name'])

        coll = _get_realm_coll()
        self.assertEquals(2, coll.count())  # One realm exists due to test base
Example #17
0
    def test_ensure_realm_exists(self):
        """ensure_realm_exists creates a realm once and is idempotent on
        repeated calls."""
        # Trying to get a none-existent realm should blow up
        with self.assertRaises(Exception) as catcher:
            realm = _get_realm_for_collection('some_collection')
        self.assertEquals(
            catcher.exception.message,
            'Realm for collection some_collection does not exist')

        # Creating the realm makes the lookup succeed.
        ensure_realm_exists('some_realm', 'some_field', 'some_collection')
        realm = _get_realm_for_collection('some_collection')
        self.assertEquals('some_realm', realm['name'])

        # Try creating the realm again, ensure it doesn't blow up or create a
        # duplicate
        ensure_realm_exists('some_realm', 'some_field', 'some_collection')
        realm = _get_realm_for_collection('some_collection')
        self.assertEquals('some_realm', realm['name'])

        coll = _get_realm_coll()
        self.assertEquals(2, coll.count())  # One realm exists due to test base
Example #18
0
def _get_query_target(collection_name, query):
    """Return the shard key that *query* explicitly targets, or None when the
    query does not pin down a single shard.

    Only string and integral shard-key values are accepted.
    """
    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    candidate = query.get(shard_field)
    if isinstance(candidate, (basestring, numbers.Integral)):
        return candidate
    return None
Example #19
0
def _get_query_target(collection_name, query):
    """Return the shard key that *query* explicitly targets, or None when the
    query does not pin down a single shard.
    """
    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    if shard_field in query:
        candidate = query[shard_field]
        if _is_valid_type_for_sharding(candidate):
            return candidate
    return None
Example #20
0
def _get_collection_for_targetted_upsert(
        collection_name, query, update, with_options=None):
    """Resolve the single collection a targetted upsert should hit.

    The shard key is taken from the update document (falling back to its $set
    clause), since for an upsert the update determines where the document
    will live.

    :param collection_name: The sharded collection being upserted into
    :param query: The upsert query (not used for routing here)
    :param update: The update document carrying the shard field
    :param with_options: Optional options applied to the collection
    :returns: The pymongo collection for the targetted shard
    """
    # None default avoids the shared-mutable-default-argument pitfall;
    # normalise to {} to preserve the original behaviour.
    if with_options is None:
        with_options = {}
    shard_key = _get_query_target(collection_name, update)
    if not shard_key:
        shard_key = _get_query_target(collection_name, update['$set'])
    realm = _get_realm_for_collection(collection_name)
    location = _get_location_for_shard(realm, shard_key)

    cluster_name, database_name = parse_location(location.location)
    connection = get_connection(cluster_name)
    collection = connection[database_name][collection_name]
    if with_options:
        collection = collection.with_options(with_options)
    return collection
Example #21
0
def _should_pause_write(collection_name, query):
    """Return True when a write matching *query* must wait because the
    targetted shard (or, for an untargetted write, any shard in the realm) is
    paused at its migration destination.
    """
    realm = _get_realm_for_collection(collection_name)
    paused = ShardStatus.POST_MIGRATION_PAUSED_AT_DESTINATION

    shard_key = _get_query_target(collection_name, query)
    if shard_key:
        meta = _get_metadata_for_shard(realm, shard_key)
        return meta['status'] == paused

    # Untargetted write: pause if any shard in the realm is paused.
    shards_coll = _get_shards_coll()
    paused_query = {'realm': realm['name'], 'status': paused}
    return shards_coll.find(paused_query).count() > 0
Example #22
0
def multishard_save(collection_name, doc, *args, **kwargs):
    """Save *doc* into the single shard its shard-field value targets.

    :raises Exception: If the document lacks the realm's shard field
    """
    _wait_for_pause_to_end(collection_name, doc)
    shard_field = _get_realm_for_collection(collection_name)['shard_field']
    if shard_field not in doc:
        raise Exception(
            'Cannot save document without shard field (%s) present'
            % shard_field)

    # A shard-field equality query is guaranteed to resolve to exactly one
    # collection, so the generic iterator yields a single pair.
    targetted_query = {shard_field: doc[shard_field]}
    iterator = _create_collection_iterator(collection_name, targetted_query)
    (collection, _), = iterator

    return collection.save(doc, *args, **kwargs)
Example #23
0
def _should_pause_write(collection_name, query):
    """Return True when a write matching *query* must wait because the
    targetted shard (or, for an untargetted write, any shard in the realm) is
    paused at its migration destination.
    """
    realm = _get_realm_for_collection(collection_name)
    paused = ShardStatus.POST_MIGRATION_PAUSED_AT_DESTINATION

    shard_key = _get_query_target(collection_name, query)
    if shard_key:
        meta = _get_metadata_for_shard(realm, shard_key)
        return meta['status'] == paused

    # Untargetted write: pause if any shard in the realm is paused.
    shards_coll = _get_shards_coll()
    paused_query = {'realm': realm['name'], 'status': paused}
    return shards_coll.find(paused_query).count() > 0
Example #24
0
def _get_oplog_pos(collection_name, shard_key):
    """Gets the oplog position for the given collection/shard key combination.
    This is necessary as the oplog will be very different on different clusters.
    """
    realm = metadata._get_realm_for_collection(collection_name)
    # NOTE(review): realm['name'] is passed here, while other call sites pass
    # the realm dict itself — confirm which _get_metadata_for_shard expects.
    shard_metadata = _get_metadata_for_shard(realm['name'], shard_key)

    current_location = shard_metadata['location']
    current_collection = _get_collection_from_location_string(
        current_location, collection_name)
    # The client that owns the shard's current collection; the replica set
    # oplog lives in its "local" database.
    current_conn = current_collection.database.client

    repl_coll = current_conn['local']['oplog.rs']
    # Sorting by $natural descending walks the oplog newest-first, so [0] is
    # the most recent operation.
    most_recent_op = repl_coll.find({}, sort=[('$natural', -1)])[0]
    ts_from = most_recent_op['ts']
    return ts_from
Example #25
0
def multishard_find_and_modify(collection_name, query, update, **kwargs):
    """Perform a find_and_modify against the single shard targetted by the
    query's shard-field value.

    :raises Exception: If the query lacks the realm's shard field
    """
    _wait_for_pause_to_end(collection_name, query)

    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']
    if shard_field not in query:
        raise Exception(
            'Cannot perform find_and_modify without shard field (%s) present' %
            shard_field)

    # A find and modify only updates and returns one document. To make this
    # vaguely sane we enforce that this has to target a single shard and
    # so we make use of the targetted upsert infrastructure to support this.
    # NOTE(review): the query (not the update) is wrapped in $set so the
    # helper can extract the shard key from it — confirm this is intentional.
    collection = _get_collection_for_targetted_upsert(collection_name, query,
                                                      {'$set': query})
    return collection.find_and_modify(query, update, **kwargs)
Example #26
0
def multishard_find_and_modify(collection_name, query, update, **kwargs):
    """Perform a find_and_modify against the single shard targetted by the
    query's shard-field value.

    :raises Exception: If the query lacks the realm's shard field
    """
    _wait_for_pause_to_end(collection_name, query)

    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']
    if shard_field not in query:
        raise Exception(
            'Cannot perform find_and_modify without shard field (%s) present'
            % shard_field)

    # A find and modify only updates and returns one document. To make this
    # vaguely sane we enforce that this has to target a single shard and
    # so we make use of the targetted upsert infrastructure to support this.
    # NOTE(review): the query (not the update) is wrapped in $set so the
    # helper can extract the shard key from it — confirm this is intentional.
    collection = _get_collection_for_targetted_upsert(
            collection_name, query, {'$set': query})
    return collection.find_and_modify(query, update, **kwargs)
Example #27
0
def _get_collection_for_targetted_upsert(collection_name,
                                         query,
                                         update,
                                         with_options=None):
    """Resolve the single collection a targetted upsert should hit.

    The shard key is taken from the update document (falling back to its $set
    clause), since for an upsert the update determines where the document
    will live.

    :param collection_name: The sharded collection being upserted into
    :param query: The upsert query (not used for routing here)
    :param update: The update document carrying the shard field
    :param with_options: Optional options applied to the collection
    :returns: The pymongo collection for the targetted shard
    """
    # None default avoids the shared-mutable-default-argument pitfall;
    # normalise to {} to preserve the original behaviour.
    if with_options is None:
        with_options = {}
    shard_key = _get_query_target(collection_name, update)
    if not shard_key:
        shard_key = _get_query_target(collection_name, update['$set'])
    realm = _get_realm_for_collection(collection_name)
    location = _get_location_for_shard(realm, shard_key)

    cluster_name, database_name = parse_location(location.location)
    connection = get_connection(cluster_name)
    collection = connection[database_name][collection_name]
    if with_options:
        collection = collection.with_options(with_options)
    return collection
Example #28
0
def multishard_aggregate(collection_name, pipeline, *args, **kwargs):
    """Run an aggregation pipeline against the single cluster targetted by
    the pipeline's leading $match stage.

    The first stage must be a $match containing the realm's shard field so
    that the aggregation resolves to exactly one collection.
    """
    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']
    first_stage = pipeline[0]
    if '$match' not in first_stage:
        raise Exception(
            'Sharded aggregation needs match in the first part of the pipeline')
    match_query = first_stage['$match']
    if shard_field not in match_query:
        raise Exception(
            'Cannot perform aggregation without shard field (%s) present'
            % shard_field)

    # To avoid aggregation needing to be recreated in this client we limit
    # aggregation to only one cluster.
    (collection, _), = _create_collection_iterator(collection_name, match_query)

    return collection.aggregate(pipeline, *args, **kwargs)
Example #29
0
def _delete_source_data(collection_name, shard_key):
    """Remove all documents for *shard_key* from the shard's current (source)
    location after a migration, paging _ids in chunks of 50.

    The shard must be in the POST_MIGRATION_DELETE state.
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shard_metadata, = api._get_shards_coll().find(
        {'realm': realm['name'], 'shard_key': shard_key})
    if shard_metadata['status'] != metadata.ShardStatus.POST_MIGRATION_DELETE:
        raise Exception('Shard not in delete state')

    source_collection = _get_collection_from_location_string(
        shard_metadata['location'], collection_name)

    # Fetch only the _ids and delete them in batches via $in.
    id_cursor = source_collection.find({shard_field: shard_key}, {'_id': 1})
    for chunk in grouper(50, id_cursor):
        chunk_ids = [record['_id'] for record in chunk]
        source_collection.remove({'_id': {'$in': chunk_ids}})
Example #30
0
    def test_get_location_ordering(self):
        """A shard with no explicit placement must fall back to the realm's
        default location, even after other shards have been placed."""
        # Exposes a bug that was found in caching and default locations
        api.create_realm('dummy-realm', 'some_field', 'dummy_collection',
                         'cluster-1/some_db')
        api.set_shard_at_rest('dummy-realm', 1, 'dest2/some_db')
        realm = metadata._get_realm_for_collection('dummy_collection')
        # Shard 2 was never placed, so it uses the realm default location.
        meta = metadata._get_metadata_for_shard(realm, 2)
        expected_meta = {
            'status': metadata.ShardStatus.AT_REST,
            'realm': 'dummy-realm',
            'location': 'cluster-1/some_db'
        }
        self.assertEquals(meta, expected_meta)

        # Shard 1 is the only shard pinned to dest2; nothing is excluded
        # anywhere.
        all_locations = metadata._get_all_locations_for_realm(realm)
        self.assertEquals([], all_locations['cluster-1/some_db'].contains)
        self.assertEquals([], all_locations['cluster-1/some_db'].excludes)
        self.assertEquals([1], all_locations['dest2/some_db'].contains)
        self.assertEquals([], all_locations['dest2/some_db'].excludes)
Example #31
0
def _sync_from_oplog(collection_name, shard_key, oplog_pos):
    """Syncs the oplog to within a reasonable timeframe of "now".

    Tails the source cluster's oplog from *oplog_pos* and replays matching
    entries onto the shard's new location.

    :returns: The timestamp of the last replayed entry, so the caller can
        resume from it
    """
    realm = metadata._get_realm_for_collection(collection_name)
    # NOTE(review): realm['name'] is passed here, while other call sites pass
    # the realm dict itself — confirm which _get_metadata_for_shard expects.
    shard_metadata = _get_metadata_for_shard(realm['name'], shard_key)

    source = _get_collection_from_location_string(shard_metadata['location'],
                                                  collection_name)
    target = _get_collection_from_location_string(
        shard_metadata['new_location'], collection_name)

    cursor = tail_oplog(source.database.client, oplog_pos)
    try:
        for entry in cursor:
            # Only entries matching this shard's key are replayed.
            replay_oplog_entry(entry, {realm['shard_field']: shard_key},
                               source, target)
            # Track progress so a partial sync can be resumed.
            oplog_pos = entry['ts']
    finally:
        cursor.close()
    return oplog_pos
Example #32
0
    def test_get_location_ordering(self):
        """A shard with no explicit placement must fall back to the realm's
        default location, even after other shards have been placed."""
        # Exposes a bug that was found in caching and default locations
        api.create_realm(
            'dummy-realm', 'some_field', 'dummy_collection',
            'cluster-1/some_db')
        api.set_shard_at_rest('dummy-realm', 1, 'dest2/some_db')
        realm = metadata._get_realm_for_collection('dummy_collection')
        # Shard 2 was never placed, so it uses the realm default location.
        meta = metadata._get_metadata_for_shard(realm, 2)
        expected_meta = {
            'status': metadata.ShardStatus.AT_REST,
            'realm': 'dummy-realm',
            'location': 'cluster-1/some_db'
        }
        self.assertEquals(meta, expected_meta)

        # Shard 1 is the only shard pinned to dest2; nothing is excluded
        # anywhere.
        all_locations = metadata._get_all_locations_for_realm(realm)
        self.assertEquals([], all_locations['cluster-1/some_db'].contains)
        self.assertEquals([], all_locations['cluster-1/some_db'].excludes)
        self.assertEquals([1], all_locations['dest2/some_db'].contains)
        self.assertEquals([], all_locations['dest2/some_db'].excludes)
Example #33
0
def multishard_aggregate(
        collection_name, pipeline, with_options=None, *args, **kwargs):
    """Run an aggregation pipeline against the single cluster targetted by
    the pipeline's leading $match stage.

    The first stage must be a $match containing the realm's shard field so
    the aggregation resolves to exactly one collection.

    :param collection_name: The sharded collection to aggregate over
    :param pipeline: The aggregation pipeline (list of stages)
    :param with_options: Optional options applied to the target collection
    :raises Exception: If the first stage is not a $match or lacks the shard
        field
    """
    # None default avoids the shared-mutable-default-argument pitfall;
    # normalise to {} to preserve the original behaviour.
    if with_options is None:
        with_options = {}
    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']
    if '$match' not in pipeline[0]:
        raise Exception(
            'Sharded aggregation needs match in the first part of the pipeline')
    if shard_field not in pipeline[0]['$match']:
        raise Exception(
            'Cannot perform aggregation without shard field (%s) present'
            % shard_field)

    # To avoid aggregation needing to be recreated in this client we limit
    # aggregation to only one cluster.
    match_query = pipeline[0]['$match']
    (collection, _, _), = _create_collection_iterator(
        collection_name, match_query, with_options)

    # TODO: useCursor needs to be False until support for Mongo2.4 is removed
    return collection.aggregate(pipeline, useCursor=False, *args, **kwargs)
Example #34
0
def _delete_source_data(collection_name, shard_key):
    """Remove all documents for *shard_key* from the shard's current (source)
    location after a migration, paging _ids in chunks of 50.

    The shard must be in the POST_MIGRATION_DELETE state.
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shard_metadata, = api._get_shards_coll().find({
        'realm': realm['name'],
        'shard_key': shard_key
    })
    if shard_metadata['status'] != metadata.ShardStatus.POST_MIGRATION_DELETE:
        raise Exception('Shard not in delete state')

    source_collection = _get_collection_from_location_string(
        shard_metadata['location'], collection_name)

    # Fetch only the _ids and delete them in batches via $in.
    id_cursor = source_collection.find({shard_field: shard_key}, {'_id': 1})
    for page in grouper(50, id_cursor):
        page_ids = [record['_id'] for record in page]
        source_collection.remove({'_id': {'$in': page_ids}})
Example #35
0
def _do_copy(collection_name, shard_key, manager):
    """Copy all documents for *shard_key* from the shard's current location
    to its new location.

    The shard must be in the MIGRATING_COPY state. Inserts are unacknowledged
    (w=0) for speed; a single getLastError call afterwards surfaces any
    failure on the target.

    :param collection_name: The collection the shard belongs to
    :param shard_key: The shard key identifying the data to copy
    :param manager: Migration manager providing insert_throttle (which may be
        changed by another thread) and the inserted-records counter
    :raises Exception: If the shard is not in the copy state or the copy fails
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shards_coll = api._get_shards_coll()
    shard_metadata, = shards_coll.find({
        'realm': realm['name'],
        'shard_key': shard_key
    })
    if shard_metadata['status'] != metadata.ShardStatus.MIGRATING_COPY:
        raise Exception('Shard not in copy state (phase 1)')

    current_location = shard_metadata['location']
    new_location = shard_metadata['new_location']

    current_collection = _get_collection_from_location_string(
        current_location, collection_name)

    new_collection = _get_collection_from_location_string(
        new_location, collection_name)

    query = {shard_field: shard_key}
    # no_cursor_timeout: a large copy can outlive the server-side cursor
    # idle timeout.
    cursor = current_collection.find(query, no_cursor_timeout=True)
    try:
        for record in cursor:
            new_collection.insert(record, w=0)

            # Get the insert throttle out of the manager. This allows for the
            # insert throttle to be changed by another thread whilst maintaining
            # thread safety.
            insert_throttle = manager.insert_throttle
            if insert_throttle:
                time.sleep(insert_throttle)

            manager.inc_inserted()
    finally:
        # no_cursor_timeout cursors must be closed explicitly.
        cursor.close()

    # getLastError surfaces any failure from the unacknowledged inserts above.
    result = new_collection.database.command('getLastError')
    if result['err']:
        raise Exception('Failed to do copy! Mongo error: %s' % result['err'])
Example #36
0
def _do_copy(collection_name, shard_key, manager):
    """Copy all documents for *shard_key* from the shard's current location
    to its new location.

    The shard must be in the MIGRATING_COPY state. Inserts are unacknowledged
    (w=0) for speed; a single getLastError call afterwards surfaces any
    failure on the target.

    :param collection_name: The collection the shard belongs to
    :param shard_key: The shard key identifying the data to copy
    :param manager: Migration manager providing insert_throttle (which may be
        changed by another thread) and the inserted-records counter
    :raises Exception: If the shard is not in the copy state or the copy fails
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shards_coll = api._get_shards_coll()
    shard_metadata, = shards_coll.find(
        {'realm': realm['name'], 'shard_key': shard_key})
    if shard_metadata['status'] != metadata.ShardStatus.MIGRATING_COPY:
        raise Exception('Shard not in copy state (phase 1)')

    current_location = shard_metadata['location']
    new_location = shard_metadata['new_location']

    current_collection = _get_collection_from_location_string(
        current_location, collection_name)

    new_collection = _get_collection_from_location_string(
        new_location, collection_name)

    query = {shard_field: shard_key}
    # no_cursor_timeout: a large copy can outlive the server-side cursor
    # idle timeout.
    cursor = current_collection.find(query, no_cursor_timeout=True)
    try:
        for record in cursor:
            new_collection.insert(record, w=0)

            # Get the insert throttle out of the manager. This allows for the
            # insert throttle to be changed by another thread whilst maintaining
            # thread safety.
            insert_throttle = manager.insert_throttle
            if insert_throttle:
                time.sleep(insert_throttle)

            manager.inc_inserted()
    finally:
        # no_cursor_timeout cursors must be closed explicitly.
        cursor.close()

    # getLastError surfaces any failure from the unacknowledged inserts above.
    result = new_collection.database.command('getLastError')
    if result['err']:
        raise Exception('Failed to do copy! Mongo error: %s' % result['err'])
Example #37
0
def _do_copy(collection_name, shard_key, insert_throttle=None):
    """Bulk-copy a shard's documents from its current to its new location.

    The shard must be in the MIGRATING_COPY state. Writes are
    unacknowledged (w=0); getLastError is checked once at the end.

    :param collection_name: logical collection whose shard is migrating.
    :param shard_key: value of the realm's shard field for this shard.
    :param insert_throttle: optional seconds to sleep after each insert.
    :raises Exception: on wrong shard state or a reported write error.
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shards_coll = api._get_shards_coll()
    # Unpacking enforces that exactly one metadata document matches.
    shard_metadata, = shards_coll.find({
        'realm': realm['name'],
        'shard_key': shard_key
    })
    if shard_metadata['status'] != metadata.ShardStatus.MIGRATING_COPY:
        raise Exception('Shard not in copy state (phase 1)')

    source_collection = _get_collection_from_location_string(
        shard_metadata['location'], collection_name)
    destination_collection = _get_collection_from_location_string(
        shard_metadata['new_location'], collection_name)

    # Keep the server-side cursor alive for long-running copies.
    cursor = source_collection.find(
        {shard_field: shard_key}, no_cursor_timeout=True)
    try:
        for count, record in enumerate(cursor):
            destination_collection.insert(record, w=0)
            # Progress log every 50k documents (including the first).
            if count % 50000 == 0:
                _detail_log('%d records inserted' % count)
            if insert_throttle:
                time.sleep(insert_throttle)
    finally:
        cursor.close()

    result = destination_collection.database.command('getLastError')
    if result['err']:
        raise Exception('Failed to do copy! Mongo error: %s' % result['err'])
Example #38
0
def multishard_insert(collection_name,
                      doc_or_docs,
                      with_options=None,
                      *args,
                      **kwargs):
    """Insert one document (or a list of documents) into a sharded
    collection, routing each document by its shard-field value.

    :param collection_name: logical (sharded) collection name.
    :param doc_or_docs: a single document dict or a list of them.
    :param with_options: optional options dict forwarded to the
        collection iterator; defaults to no options.
    :returns: the inserted _id for a single-document call, or a list of
        _ids when a list was given.
    :raises Exception: if any document is missing the realm's shard
        field.
    """
    # TODO Remove this and use insert_one/insert_many to comply with new
    # pymongo deprecations
    if with_options is None:
        # Normalise here rather than using a mutable default argument,
        # which would be shared across all calls.
        with_options = {}
    is_multi_insert = isinstance(doc_or_docs, list)
    all_docs = doc_or_docs if is_multi_insert else [doc_or_docs]

    _wait_for_pause_to_end(collection_name, doc_or_docs)
    realm = _get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']
    # Validate every document up front so a multi-insert fails before
    # any partial writes happen.
    for doc in all_docs:
        if shard_field not in doc:
            raise Exception(
                'Cannot insert document without shard field (%s) present' %
                shard_field)

    # Inserts can use our generic collection iterator with a specific query
    # that is guaranteed to return exactly one collection.
    # TODO This makes a multi-insert into lots of small inserts. This could be
    # optimised. For now, we'll see if this is OK.
    result = []
    for doc in all_docs:
        simple_query = {shard_field: doc[shard_field]}
        (collection, _, _), = _create_collection_iterator(
            collection_name, simple_query, with_options)
        result.append(collection.insert(doc, *args, **kwargs))
    if not is_multi_insert:
        return result[0]
    return result
Example #39
0
def _do_copy(collection_name, shard_key, insert_throttle=None):
    """Copy all documents belonging to a shard into its new location.

    Preconditions: the shard's metadata status is MIGRATING_COPY.
    Inserts are unacknowledged (w=0); a final getLastError call reports
    any failure on the connection.

    :param insert_throttle: seconds to sleep after each insert, or None
        to copy at full speed.
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    # Exactly one shard metadata document is expected; the tuple
    # unpacking raises if that invariant does not hold.
    shard_metadata, = api._get_shards_coll().find(
        {'realm': realm['name'], 'shard_key': shard_key})
    if shard_metadata['status'] != metadata.ShardStatus.MIGRATING_COPY:
        raise Exception('Shard not in copy state (phase 1)')

    source = _get_collection_from_location_string(
        shard_metadata['location'], collection_name)
    destination = _get_collection_from_location_string(
        shard_metadata['new_location'], collection_name)

    # no_cursor_timeout: large shards can take longer than the server's
    # default cursor idle timeout.
    cursor = source.find({shard_field: shard_key}, no_cursor_timeout=True)
    copied = 0
    try:
        for document in cursor:
            destination.insert(document, w=0)
            # Emit progress every 50k documents, starting with the first.
            if copied % 50000 == 0:
                _detail_log('%d records inserted' % copied)
            if insert_throttle:
                time.sleep(insert_throttle)
            copied += 1
    finally:
        cursor.close()

    result = destination.database.command('getLastError')
    if result['err']:
        raise Exception('Failed to do copy! Mongo error: %s' % result['err'])
Example #40
0
def _sync_from_oplog(collection_name, shard_key, oplog_pos):
    """Syncs the oplog to within a reasonable timeframe of "now".

    Replays replica-set oplog entries recorded strictly after
    ``oplog_pos`` against the shard's new location, so writes made to
    the old location during the copy phase are not lost.

    :param collection_name: logical collection whose shard is migrating.
    :param shard_key: value of the realm's shard field for this shard.
    :param oplog_pos: BSON timestamp to resume replay from (exclusive).
    :returns: the timestamp of the last applied entry; pass it back in
        on the next call to continue from there.
    """
    conn = get_controlling_db().connection
    repl_coll = conn['local']['oplog.rs']

    # Tailable cursor over the oplog; oplog_replay lets the server seek
    # efficiently on 'ts', and the $natural hint forces insertion order.
    cursor = repl_coll.find({'ts': {'$gt': oplog_pos}}, tailable=True)
    cursor = cursor.add_option(_QUERY_OPTIONS['oplog_replay'])
    cursor = cursor.hint([('$natural', 1)])

    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shards_coll = api._get_shards_coll()
    # Tuple unpacking asserts exactly one metadata document matches.
    shard_metadata, = shards_coll.find(
        {'realm': realm['name'], 'shard_key': shard_key})

    current_location = shard_metadata['location']
    new_location = shard_metadata['new_location']

    current_collection = _get_collection_from_location_string(
        current_location, collection_name)

    new_collection = _get_collection_from_location_string(
        new_location, collection_name)

    shard_query = {shard_field: shard_key}

    # Oplog entries are namespaced "<db>.<collection>"; only replay ops
    # that hit the source collection.
    current_namespace = "%s.%s" % (
        current_collection.database.name, current_collection.name)

    for r in cursor:
        if r['ns'] != current_namespace:
            continue

        if r['op'] in ['u', 'i']:
            # Check that this doc is part of our query set
            # ('o2' carries the update selector; inserts only have 'o').
            oid = r.get('o2', r['o'])['_id']
            object_query = {'_id': oid}
            object_query.update(shard_query)
            match = bool(
                current_collection.find(object_query).count())
        elif r['op'] == 'd':
            # The doc is already gone from the source, so membership in
            # this shard is checked against the new location instead.
            oid = r.get('o2', r['o'])['_id']
            object_query = {'_id': oid}
            object_query.update(shard_query)
            match = bool(
                new_collection.find(object_query).count())

        else:
            # Other op types (e.g. commands/no-ops) are not replayed.
            print 'Ignoring op', r['op'], r
            continue

        if not match:
            continue

        if r['op'] == 'u':
            blue(' - Updating %s with %s' % (oid, r['o']))
            # safe=True: acknowledged write (legacy pymongo spelling).
            new_collection.update(
                {'_id': oid}, r['o'], safe=True)

        elif r['op'] == 'i':
            try:
                new_collection.insert(r['o'], safe=True)
            except pymongo.errors.DuplicateKeyError:
                # Already copied during the bulk phase; replay is
                # idempotent for inserts.
                pass
        elif r['op'] == 'd':
            blue(' - Removing %s' % oid)
            new_collection.remove({'_id': oid}, safe=True)

        # Track progress so the caller can resume from the last applied
        # entry rather than re-scanning.
        oplog_pos = r['ts']

    return oplog_pos
Example #41
0
def _sync_from_oplog(collection_name, shard_key, oplog_pos):
    """Syncs the oplog to within a reasonable timeframe of "now".

    Tails the replica-set oplog from ``oplog_pos`` (exclusive) and
    re-applies insert/update/delete operations for this shard to the
    shard's new location, covering writes that happened while the bulk
    copy was running.

    :param collection_name: logical collection whose shard is migrating.
    :param shard_key: value of the realm's shard field for this shard.
    :param oplog_pos: BSON timestamp to start replaying after.
    :returns: timestamp of the last entry applied, for resuming later.
    """
    conn = get_controlling_db().connection
    repl_coll = conn['local']['oplog.rs']

    # Tailable + oplog_replay + $natural hint = efficient in-order scan
    # of the oplog starting at the requested timestamp.
    cursor = repl_coll.find({'ts': {'$gt': oplog_pos}}, tailable=True)
    cursor = cursor.add_option(_QUERY_OPTIONS['oplog_replay'])
    cursor = cursor.hint([('$natural', 1)])

    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shards_coll = api._get_shards_coll()
    # Tuple unpacking asserts exactly one metadata document matches.
    shard_metadata, = shards_coll.find({
        'realm': realm['name'],
        'shard_key': shard_key
    })

    current_location = shard_metadata['location']
    new_location = shard_metadata['new_location']

    current_collection = _get_collection_from_location_string(
        current_location, collection_name)

    new_collection = _get_collection_from_location_string(
        new_location, collection_name)

    shard_query = {shard_field: shard_key}

    # Only replay entries whose namespace ("<db>.<collection>") is the
    # source collection.
    current_namespace = "%s.%s" % (current_collection.database.name,
                                   current_collection.name)

    for r in cursor:
        if r['ns'] != current_namespace:
            continue

        if r['op'] in ['u', 'i']:
            # Check that this doc is part of our query set
            # ('o2' holds the update selector; inserts only carry 'o').
            oid = r.get('o2', r['o'])['_id']
            object_query = {'_id': oid}
            object_query.update(shard_query)
            match = bool(current_collection.find(object_query).count())
        elif r['op'] == 'd':
            # Deleted docs are gone from the source; check shard
            # membership against the destination instead.
            oid = r.get('o2', r['o'])['_id']
            object_query = {'_id': oid}
            object_query.update(shard_query)
            match = bool(new_collection.find(object_query).count())

        else:
            # Non-CRUD ops (commands, no-ops) are skipped.
            print 'Ignoring op', r['op'], r
            continue

        if not match:
            continue

        if r['op'] == 'u':
            blue(' - Updating %s with %s' % (oid, r['o']))
            # safe=True: acknowledged write (legacy pymongo spelling).
            new_collection.update({'_id': oid}, r['o'], safe=True)

        elif r['op'] == 'i':
            try:
                new_collection.insert(r['o'], safe=True)
            except pymongo.errors.DuplicateKeyError:
                # Already present from the bulk copy; inserts replay
                # idempotently.
                pass
        elif r['op'] == 'd':
            blue(' - Removing %s' % oid)
            new_collection.remove({'_id': oid}, safe=True)

        # Remember the last applied position for the caller to resume.
        oplog_pos = r['ts']

    return oplog_pos
Example #42
0
def _sync_from_oplog(collection_name, shard_key, oplog_pos):
    """Syncs the oplog to within a reasonable timeframe of "now".

    Replays oplog entries from ``oplog_pos`` onwards against the shard's
    new location so that writes made during the bulk-copy phase are
    carried over. Unlike older variants, this one tails the oplog of the
    *source* collection's own replica set and backfills documents that
    the copy missed when it sees an update for them.

    :param collection_name: logical collection whose shard is migrating.
    :param shard_key: value of the realm's shard field for this shard.
    :param oplog_pos: BSON timestamp to replay from (inclusive — $gte).
    :returns: timestamp of the last applied entry, for resuming later.
    """
    realm = metadata._get_realm_for_collection(collection_name)
    shard_field = realm['shard_field']

    shard_metadata = _get_metadata_for_shard(realm['name'], shard_key)

    current_location = shard_metadata['location']
    new_location = shard_metadata['new_location']

    current_collection = _get_collection_from_location_string(
        current_location, collection_name)

    new_collection = _get_collection_from_location_string(
        new_location, collection_name)

    # Get the connection used by the source collection and use that for the
    # oplog tailing
    conn = current_collection.database.client
    repl_coll = conn['local']['oplog.rs']
    # $gte may re-deliver the entry at oplog_pos; the replay below is
    # written to be idempotent so that is harmless.
    cursor = repl_coll.find(
        {'ts': {'$gte': oplog_pos}},
        cursor_type=pymongo.CursorType.TAILABLE,
        oplog_replay=True)
    cursor = cursor.hint([('$natural', 1)])

    shard_query = {shard_field: shard_key}

    # Only entries whose namespace ("<db>.<collection>") is the source
    # collection are replayed.
    current_namespace = "%s.%s" % (
        current_collection.database.name, current_collection.name)

    for r in cursor:
        if r['ns'] != current_namespace:
            continue

        if r['op'] in ['u', 'i']:
            # Check that this doc is part of our query set
            # ('o2' holds the update selector; inserts only carry 'o').
            oid = r.get('o2', r['o'])['_id']
            object_query = {'_id': oid}
            object_query.update(shard_query)
            match = bool(
                current_collection.find(object_query).count())
        elif r['op'] == 'd':
            # Deleted docs no longer exist at the source, so check shard
            # membership against the destination instead.
            oid = r.get('o2', r['o'])['_id']
            object_query = {'_id': oid}
            object_query.update(shard_query)
            match = bool(
                new_collection.find(object_query).count())
        else:
            # Notification ops can be ignored.
            continue

        if not match:
            continue

        if r['op'] == 'u':
            # Verify that this object has been successfully copied from the old
            # collection before performing the update. If an object is moved in
            # the index during a migration then it *can* be missed and we pick
            # it up here instead.
            if not new_collection.find({'_id': oid}).count():
                doc = list(current_collection.find({'_id': oid}))
                if doc:
                    doc = doc[0]
                    new_collection.insert(doc, w=1)
            new_collection.update(
                {'_id': oid}, r['o'], w=1)

        elif r['op'] == 'i':
            try:
                new_collection.insert(r['o'], w=1)
            except pymongo.errors.DuplicateKeyError:
                # Already copied by the bulk phase; replay is idempotent.
                pass
        elif r['op'] == 'd':
            new_collection.remove({'_id': oid}, w=1)

        # Track the last applied position so the caller can resume here.
        oplog_pos = r['ts']

    return oplog_pos