Beispiel #1
0
def copy_collection(source, dest, state_path, percent):
    """
    Copies all documents from source to destination collection. Inserts documents in
    batches using insert workers, which are each run in their own greenlet. Ensures that
    the destination is empty before starting the copy.

    Does no safety checks -- this is up to the caller.

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param state_path  path of state database
    @param percent     percentage of documents to copy
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        max_pool_size=30,
                                        read_preference=ReadPreference.SECONDARY,
                                        document_class=FasterOrderedDict)

    source_collection = source_client[source['db']][source['collection']]
    if source_client.is_mongos:
        raise Exception("for performance reasons, sources must be mongod instances; %s:%d is not",
                        source['host'], source['port'])

    dest_client = utils.mongo_connect(dest,
                                      max_pool_size=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # record timestamp of last oplog entry, so that we know where to start applying ops
    # later
    oplog_ts = utils.get_last_oplog_entry(source_client)['ts']
    state_db.update_oplog_ts(source, dest, oplog_ts)

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count())
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent)/100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(
        projection={'_id':True},
        modifiers={'$snapshot':True}
    )
    cursor.batch_size(5000)
    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)
    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:        
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()
    log.info("done with initial copy")

    state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
Beispiel #2
0
def copy_collection(source, dest, state_path, percent):
    """
    Copies all documents from source to destination collection. Inserts documents in
    batches using insert workers, which are each run in their own greenlet. Ensures that
    the destination is empty before starting the copy.

    Does no safety checks -- this is up to the caller.

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param state_path  path of state database
    @param percent     percentage of documents to copy
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(
        source['host'],
        source['port'],
        ensure_direct=True,
        max_pool_size=30,
        read_preference=ReadPreference.SECONDARY,
        document_class=FasterOrderedDict)

    source_collection = source_client[source['db']][source['collection']]
    if source_client.is_mongos:
        raise Exception(
            "for performance reasons, sources must be mongod instances; %s:%d is not",
            source['host'], source['port'])

    dest_client = utils.mongo_connect(dest['host'],
                                      dest['port'],
                                      max_pool_size=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # record timestamp of last oplog entry, so that we know where to start applying ops
    # later
    oplog_ts = utils.get_last_oplog_entry(source_client)['ts']
    state_db.update_oplog_ts(source, dest, oplog_ts)

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(source, dest,
                              CopyStateDB.STATE_WAITING_FOR_INDICES)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count())
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent) / 100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(fields=["_id"],
                                    snapshot=True,
                                    timeout=False)
    cursor.batch_size(5000)
    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)
    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()
    log.info("done with initial copy")

    state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
Beispiel #3
0
def copy_collection(manifest, state_path, percent):
    """
    Copies all documents from source to destination collection. Inserts documents in
    batches using insert workers, which are each run in their own greenlet.

    Does no safety checks -- this is up to the caller.

    @param manifest    dict of (srchost, srcport, srcuser, srcpwd, srcdb, srccol,
                                desthost, destport, destuser, destpwd, destdb, destcol)
    @param state_path  path of state database
    @param percent     percentage of documents to copy
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(
        manifest['srchost'],
        manifest['srcport'],
        manifest['srcuser'],
        manifest['srcpwd'],
        maxPoolSize=30,
        read_preference=ReadPreference.SECONDARY,
        document_class=FasterOrderedDict)

    source_collection = source_client[manifest['srcdb']][manifest['srccol']]
    if source_client.is_mongos:
        raise Exception(
            "for performance reasons, sources must be mongod instances; %s:%d is not",
            manifest['srchost'], source['srcport'])

    dest_client = utils.mongo_connect(manifest['desthost'],
                                      manifest['destport'],
                                      manifest['destuser'],
                                      manifest['destpwd'],
                                      maxPoolSize=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[manifest['destdb']][manifest['destcol']]

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(manifest, CopyStateDB.STATE_APPLYING_OPLOG)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count(filter=manifest["query"]))
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent) / 100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(filter=manifest["query"],
                                    projection={"_id": True},
                                    no_cursor_timeout=False)
    cursor.batch_size(5000)
    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)
    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()

    srccount = stats.total_docs
    destcount = dest_collection.count(filter=manifest["query"])
    if srccount == destcount:
        log.info("COPY SUCCEED. srccount(%d) == destcount(%d)" %
                 (srccount, destcount))
    else:
        log.error("COPY FAILED. srccount(%d) != destcount(%d)" %
                  (srccount, destcount))

    state_db.update_state(manifest, CopyStateDB.STATE_APPLYING_OPLOG)