Example #1
0
def copy_indexes(source, dest):
    """
    Recreates each index of the source collection on the destination collection.

    The index name plus the `unique` and `sparse` options are carried over; the index
    version (`v`) is dropped, and any other option raises NotImplementedError.

    @param source  dict with host, port, db and collection keys for the source
    @param dest    dict with host, port, db and collection keys for the destination
    """
    # source side: requested with ensure_direct and a SECONDARY read preference
    src_conn = utils.mongo_connect(source['host'], source['port'],
                                   ensure_direct=True,
                                   max_pool_size=1,
                                   read_preference=ReadPreference.SECONDARY)
    src_coll = src_conn[source['db']][source['collection']]

    # destination side
    dst_conn = utils.mongo_connect(dest['host'], dest['port'], max_pool_size=1)
    dst_coll = dst_conn[dest['db']][dest['collection']]

    copied_options = ('unique', 'sparse')
    for index_name, index_info in src_coll.index_information().items():
        options = {'name': index_name}
        key_spec = None
        for option, value in index_info.items():
            if option == 'v':
                # the index version is deliberately not copied
                continue
            if option == 'key':
                # directions may arrive as floats; normalize them to ints
                key_spec = [(field_name, int(order)) for field_name, order in value]
            elif option in copied_options:
                options[option] = value
            else:
                # TODO: other index options exist that are not handled here
                raise NotImplementedError("don't know how to handle index info key %s" % option)

        assert key_spec is not None
        log.info("ensuring index on %s (options = %s)", key_spec, options)
        dst_coll.ensure_index(key_spec, **options)
Example #2
0
def ensure_empty_dest(dest):
    """
    Calls die() if the destination collection holds at least one document.

    @param dest  dict with host, port, db and collection keys for the destination
    """
    connection = utils.mongo_connect(dest["host"],
                                     dest["port"],
                                     ensure_direct=True,
                                     max_pool_size=1,
                                     read_preference=ReadPreference.PRIMARY)
    target = connection[dest["db"]][dest["collection"]]
    existing_docs = target.count()
    if existing_docs > 0:
        die("destination must be empty!")
Example #3
0
def ensure_empty_dest(dest):
    """
    Calls die() if the destination collection holds at least one document.

    @param dest  connection description passed to utils.mongo_connect; its 'db' and
                 'collection' entries select the collection to check
    """
    connect_options = {
        'ensure_direct': True,
        'read_preference': ReadPreference.PRIMARY,
    }
    connection = utils.mongo_connect(dest, **connect_options)
    target = connection[dest['db']][dest['collection']]
    if target.count() > 0:
        die("destination must be empty!")
Example #4
0
def ensure_empty_dest(dest):
    """
    Guards a copy run: die() is invoked when the destination already has documents.

    @param dest  connection description passed to utils.mongo_connect; its 'db' and
                 'collection' entries select the collection to check
    """
    primary = utils.mongo_connect(
        dest, ensure_direct=True, read_preference=ReadPreference.PRIMARY)
    database = primary[dest['db']]
    doc_count = database[dest['collection']].count()
    if doc_count > 0:
        die("destination must be empty!")
Example #5
0
def copy_indexes(source, dest):
    """
    Copies all indexes from source to destination, preserving options such as unique
    and sparse.

    Options listed in `passthrough_options` are forwarded unchanged, the index version
    (`v`) is skipped, and any other option is logged and raises NotImplementedError.

    @param source  connection description for the source; must provide 'db' and 'collection'
    @param dest    connection description for the destination; must provide 'db' and
                   'collection'
    """
    # connect to mongo instances
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        max_pool_size=1,
                                        read_preference=ReadPreference.SECONDARY)
    source_collection = source_client[source['db']][source['collection']]

    dest_client = utils.mongo_connect(dest, max_pool_size=1)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # index options that are handed to ensure_index() as-is
    passthrough_options = ('unique', 'sparse', 'background', 'safe', 'default_language',
                           'weights', 'language_override', 'textIndexVersion')

    # copy indices
    for name, index in source_collection.index_information().items():
        kwargs = {'name': name}
        index_key = None
        for k, v in index.items():
            if k in passthrough_options:
                kwargs[k] = v
            elif k == 'v':
                continue
            elif k == 'key':
                # sometimes, pymongo will give us floating point numbers, so let's make
                # sure they're ints instead; directions that int() rejects with a
                # ValueError are kept unchanged
                index_key = []
                for field, direction in v:
                    try:
                        direction = int(direction)
                    except ValueError:
                        pass
                    index_key.append((field, direction))
            else:
                # report through the logger instead of a Python 2-only print statement
                log.error("unhandled option on index %s (%s): %s = %s", name, index, k, v)
                raise NotImplementedError("don't know how to handle index info key %s" % k)
            # TODO: there are other index options that probably aren't handled here

        assert index_key is not None, "index %s has no 'key' entry" % name
        log.info("ensuring index on %s (options = %s)", index_key, kwargs)
        dest_collection.ensure_index(index_key, **kwargs)
Example #6
0
def copy_indexes(source, dest):
    """
    Copies all indexes from source to destination, preserving options such as unique
    and sparse.

    Options listed in `passthrough_options` are forwarded unchanged, the index version
    (`v`) is skipped, and any other option is logged and raises NotImplementedError.

    @param source  connection description for the source; must provide 'db' and 'collection'
    @param dest    connection description for the destination; must provide 'db' and
                   'collection'
    """
    # connect to mongo instances
    source_client = utils.mongo_connect(
        source,
        ensure_direct=True,
        max_pool_size=1,
        read_preference=ReadPreference.SECONDARY)
    source_collection = source_client[source['db']][source['collection']]

    dest_client = utils.mongo_connect(dest, max_pool_size=1)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # index options that are handed to ensure_index() as-is
    passthrough_options = (
        'unique', 'sparse', 'background', 'safe',
        'default_language', 'weights', 'language_override',
        'textIndexVersion', 'ns',
    )

    # copy indices
    for name, index in source_collection.index_information().items():
        kwargs = {'name': name}
        index_key = None
        for k, v in index.items():
            if k in passthrough_options:
                kwargs[k] = v
            elif k == 'v':
                continue
            elif k == 'key':
                # coerce numeric directions to int; directions that int() rejects
                # with a ValueError are kept unchanged
                index_key = []
                for field, direction in v:
                    try:
                        direction = int(direction)
                    except ValueError:
                        pass
                    index_key.append((field, direction))
            else:
                # report through the logger instead of a Python 2-only print statement
                log.error("unhandled option on index %s (%s): %s = %s",
                          name, index, k, v)
                raise NotImplementedError(
                    "don't know how to handle index info key %s" % k)
            # TODO: there are other index options that probably aren't handled here

        assert index_key is not None, "index %s has no 'key' entry" % name
        log.info("ensuring index on %s (options = %s)", index_key, kwargs)
        dest_collection.ensure_index(index_key, **kwargs)
Example #7
0
def update_ttls(source, state_path, seconds):
    """
    Walks every _id of the source collection and hands them, in batches, to
    adjust_ttl_batch_worker greenlets together with `seconds`.

    @param source      dict of (host, port, db, collection) for the collection to update
    @param state_path  not used by this function
    @param seconds     value forwarded to adjust_ttl_batch_worker; when negative, nothing
                       is updated

    Raises Exception when the source connection is a mongos.
    """
    gevent.monkey.patch_socket()

    source_client = utils.mongo_connect(
        source['host'],
        source['port'],
        ensure_direct=True,
        max_pool_size=30,
        read_preference=ReadPreference.SECONDARY,
        document_class=FasterOrderedDict)

    source_collection = source_client[source['db']][source['collection']]
    if source_client.is_mongos:
        # interpolate the message here; passing the values as extra Exception
        # arguments would leave the placeholders unformatted
        raise Exception(
            "for performance reasons, sources must be mongod instances; %s:%s is not"
            % (source['host'], source['port']))

    if seconds < 0:
        log.info("Skipping update, TTL less than 0")
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count())

    # number of _ids handed to each worker greenlet
    batch_size = 250

    ids = []
    cursor = source_collection.find(fields=["_id"],
                                    snapshot=True,
                                    timeout=False)
    cursor.batch_size(5000)
    insert_pool = Pool(40)
    stats_greenlet = gevent.spawn(_ttl_stats_worker, stats)

    for doc in cursor:
        ids.append(doc["_id"])
        if len(ids) % batch_size == 0:
            # hand the full batch to a worker and start collecting a new one
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(adjust_ttl_batch_worker,
                              source_collection=source_collection,
                              seconds=seconds,
                              ids=outgoing_ids,
                              stats=stats)

        # yield so that already-spawned greenlets get a chance to run
        gevent.sleep()

    # the last, partial batch is processed inline
    if len(ids) > 0:
        adjust_ttl_batch_worker(source_collection=source_collection,
                                seconds=seconds,
                                ids=ids,
                                stats=stats)

    insert_pool.join()
    stats.log()
    stats_greenlet.kill()
    log.info("Finished TTL adjust")
Example #8
0
def copy_indexes(source, dest):
    """
    Mirrors the indexes of the source collection onto the destination collection.

    Each index keeps its name and its `unique` / `sparse` options. The index version
    (`v`) is ignored; any other option raises NotImplementedError.

    @param source  dict with host, port, db and collection keys for the source
    @param dest    dict with host, port, db and collection keys for the destination
    """
    def _translate(index_name, index_info):
        # turn one index_information() entry into (key list, ensure_index kwargs)
        options = {'name': index_name}
        key_spec = None
        for option, value in index_info.items():
            if option == 'key':
                # directions may arrive as floats; normalize them to ints
                key_spec = [(field_name, int(order))
                            for (field_name, order) in value]
            elif option == 'v':
                continue
            elif option in ('unique', 'sparse'):
                options[option] = value
            else:
                # TODO: other index options exist that are not handled here
                raise NotImplementedError(
                    "don't know how to handle index info key %s" % option)
        return key_spec, options

    # connect to both mongo instances, source first
    src_conn = utils.mongo_connect(
        source['host'],
        source['port'],
        ensure_direct=True,
        max_pool_size=1,
        read_preference=ReadPreference.SECONDARY)
    src_coll = src_conn[source['db']][source['collection']]

    dst_conn = utils.mongo_connect(dest['host'],
                                   dest['port'],
                                   max_pool_size=1)
    dst_coll = dst_conn[dest['db']][dest['collection']]

    for index_name, index_info in src_coll.index_information().items():
        key_spec, options = _translate(index_name, index_info)
        assert key_spec is not None
        log.info("ensuring index on %s (options = %s)", key_spec, options)
        dst_coll.ensure_index(key_spec, **options)
Example #9
0
def copy_collection(manifest, state_path, percent):
    """
    Copies all documents matching manifest["query"] from the source to the destination
    collection. Inserts documents in batches using insert workers, which are each run
    in their own greenlet.

    After the copy, the expected document count is compared with the destination count
    and the outcome is logged; the state is then set to STATE_APPLYING_OPLOG in both
    cases.

    Does no safety checks -- this is up to the caller.

    @param manifest    dict of (srchost, srcport, srcuser, srcpwd, srcdb, srccol,
                                desthost, destport, destuser, destpwd, destdb, destcol,
                                query)
    @param state_path  path of state database
    @param percent     percentage of documents to copy; 0 skips the copy entirely

    Raises Exception when the source connection is a mongos.
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(
        manifest['srchost'],
        manifest['srcport'],
        manifest['srcuser'],
        manifest['srcpwd'],
        maxPoolSize=30,
        read_preference=ReadPreference.SECONDARY,
        document_class=FasterOrderedDict)

    source_collection = source_client[manifest['srcdb']][manifest['srccol']]
    if source_client.is_mongos:
        # the host/port come from the manifest (there is no `source` name in this
        # function) and are interpolated here so the message is readable
        raise Exception(
            "for performance reasons, sources must be mongod instances; %s:%s is not"
            % (manifest['srchost'], manifest['srcport']))

    dest_client = utils.mongo_connect(manifest['desthost'],
                                      manifest['destport'],
                                      manifest['destuser'],
                                      manifest['destpwd'],
                                      maxPoolSize=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[manifest['destdb']][manifest['destcol']]

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(manifest, CopyStateDB.STATE_APPLYING_OPLOG)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count(filter=manifest["query"]))
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent) / 100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    # NOTE(review): no_cursor_timeout=False keeps the server-side cursor timeout enabled;
    # confirm this is intended for a long-running scan
    cursor = source_collection.find(filter=manifest["query"],
                                    projection={"_id": True},
                                    no_cursor_timeout=False)
    cursor.batch_size(5000)
    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)
    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        # yield so that already-spawned greenlets get a chance to run
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()

    # compare the expected number of documents with what the destination now holds
    srccount = stats.total_docs
    destcount = dest_collection.count(filter=manifest["query"])
    if srccount == destcount:
        log.info("COPY SUCCEED. srccount(%d) == destcount(%d)" %
                 (srccount, destcount))
    else:
        log.error("COPY FAILED. srccount(%d) != destcount(%d)" %
                  (srccount, destcount))

    state_db.update_state(manifest, CopyStateDB.STATE_APPLYING_OPLOG)
Example #10
0
def copy_collection(source, dest, state_path, percent):
    """
    Copies all documents from source to destination collection. Inserts documents in
    batches using insert workers, which are each run in their own greenlet.

    Before copying, the timestamp of the last source oplog entry is stored in the state
    database; when done, the state is set to STATE_WAITING_FOR_INDICES.

    Does no safety checks (including whether the destination is empty) -- this is up to
    the caller.

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param state_path  path of state database
    @param percent     percentage of documents to copy; 0 skips the copy entirely

    Raises Exception when the source connection is a mongos.
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        max_pool_size=30,
                                        read_preference=ReadPreference.SECONDARY,
                                        document_class=FasterOrderedDict)

    source_collection = source_client[source['db']][source['collection']]
    if source_client.is_mongos:
        # interpolate the message here; passing the values as extra Exception
        # arguments would leave the placeholders unformatted
        raise Exception(
            "for performance reasons, sources must be mongod instances; %s:%s is not"
            % (source['host'], source['port']))

    dest_client = utils.mongo_connect(dest,
                                      max_pool_size=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # record timestamp of last oplog entry, so that we know where to start applying ops
    # later
    oplog_ts = utils.get_last_oplog_entry(source_client)['ts']
    state_db.update_oplog_ts(source, dest, oplog_ts)

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count())
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent) / 100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(
        projection={'_id': True},
        modifiers={'$snapshot': True}
    )
    cursor.batch_size(5000)
    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)
    for doc in cursor:
        _id = doc['_id']

        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
        # documents corresponding to them
        ids.append(_id)
        if len(ids) % INSERT_SIZE == 0:
            outgoing_ids = ids
            ids = []
            insert_pool.spawn(_find_and_insert_batch_worker,
                              source_collection=source_collection,
                              dest_collection=dest_collection,
                              ids=outgoing_ids,
                              stats=stats)
        # yield so that already-spawned greenlets get a chance to run
        gevent.sleep()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()
    log.info("done with initial copy")

    state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
Example #11
0
    # command-line options: source URL, destination URL and the file of mismatched ids
    parser.add_argument(
        '--source', type=str, required=True, metavar='URL',
        help='source to read from; e.g. localhost:27017/prod_maestro.emails')
    parser.add_argument(
        '--dest', type=str, required=True, metavar='URL',
        help='destination to copy to; e.g. localhost:27017/destination_db.emails')
    # default=None has no effect here because the option is required
    parser.add_argument(
        '--mismatches-file', type=str, default=None, required=True, metavar='FILENAME',
        help='read ids to copy from this file, which is generated by compare_collections.py')
    args = parser.parse_args()

    # connect to source and destination
    source = utils.parse_mongo_url(args.source)
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        max_pool_size=POOL_SIZE,
                                        read_preference=ReadPreference.SECONDARY_PREFERRED,
                                        document_class=FasterOrderedDict)
    source_collection = source_client[source['db']][source['collection']]
    # NOTE(review): this parses as `(not is_mongos) or is_primary`, so a primary source
    # is rejected even though the message says a primary is acceptable -- confirm whether
    # `not (is_mongos or is_primary)` was intended
    if not source_client.is_mongos or source_client.is_primary:
        raise Exception("source must be a mongos instance or a primary")


    dest = utils.parse_mongo_url(args.dest)
    dest_client = utils.mongo_connect(dest,
                                      max_pool_size=POOL_SIZE,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # compares the two parsed URLs; this runs only after both connections were opened
    if source == dest:
        raise ValueError("source and destination cannot be the same!")
Example #12
0
def compare_collections(source, dest, percent, error_bp, recent_ops, ids_file):
    """
    compares two collections, using retries to see if collections are eventually consistent

    The _ids to check come from one of three places, in this order of precedence:
    recent oplog entries (recent_ops), a file of ids (ids_file), or every _id in the
    source collection.

    @param source      connection description of the source; provides 'db' and 'collection'
    @param dest        connection description of the copied data to verify; provides 'db'
                       and 'collection'
    @param percent     percentage of documents to verify, or None to verify all of them
    @param error_bp    not used by this function
    @param recent_ops  when truthy, passed to _get_ids_for_recent_ops and used as the
                       expected document total
    @param ids_file    when truthy (and recent_ops is not), passed to _get_ids_in_file;
                       the first value it yields is taken as the document total
    """
    MismatchLogger.collection_name = source['collection']

    # setup client connections
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        maxPoolSize=POOL_SIZE,
                                        slave_okay=True,
                                        document_class=dict)
    source_collection = source_client[source['db']][source['collection']]

    dest_client = utils.mongo_connect(dest,
                                      ensure_direct=True,
                                      maxPoolSize=POOL_SIZE,
                                      slave_okay=True,
                                      document_class=dict)

    dest_collection = dest_client[dest['db']][dest['collection']]

    # setup stats
    stats = CompareStats()
    compare_pool = gevent.pool.Pool(POOL_SIZE)
    retry_pool = gevent.pool.Pool(POOL_SIZE * 5)

    # get just _id's first, because long-running queries degrade significantly
    # over time; reading just _ids is fast enough (or small enough?) not to suffer
    # from this degradation
    if recent_ops:
        id_getter = _get_ids_for_recent_ops(source_client, recent_ops)
        stats.total_docs = recent_ops
        if source_client.is_mongos:
            log.error(
                "cannot read oplogs through mongos; specify mongod instances instead"
            )
            return
    elif ids_file:
        id_getter = _get_ids_in_file(ids_file)
        # the builtin next() works on both Python 2 and Python 3 iterators,
        # unlike the Python 2-only .next() method
        stats.total_docs = next(id_getter)
    else:
        id_getter = _get_all_ids(source_collection)
        stats.total_docs = source_collection.count()

    if percent is not None:
        stats.total_docs = int(float(stats.total_docs) * percent / 100.0)

    stats_greenlet = gevent.spawn(_stats_worker, stats)

    # read documents in batches, but perform retries individually in separate greenlets
    _ids = []
    for _id in id_getter:
        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        _ids.append(_id)
        if len(_ids) == READ_SIZE:
            _ids_to_compare = _ids
            _ids = []
            compare_pool.spawn(_compare_ids_worker,
                               _ids=_ids_to_compare,
                               source_collection=source_collection,
                               dest_collection=dest_collection,
                               stats=stats,
                               retry_pool=retry_pool)

    # compare final batch of _id's
    if _ids:
        compare_pool.spawn(_compare_ids_worker,
                           _ids=_ids,
                           source_collection=source_collection,
                           dest_collection=dest_collection,
                           stats=stats,
                           retry_pool=retry_pool)

    # wait for all greenlets to finish
    compare_pool.join()
    retry_pool.join()
    stats_greenlet.kill()
    stats.log()
    log.info("compare finished")
Example #13
0
    # command-line options: source URL, destination URL and the file of mismatched ids
    parser.add_argument(
        '--source', type=str, required=True, metavar='URL',
        help='source to read from; e.g. localhost:27017/prod_maestro.emails')
    parser.add_argument(
        '--dest', type=str, required=True, metavar='URL',
        help='destination to copy to; e.g. localhost:27017/destination_db.emails')
    # default=None has no effect here because the option is required
    parser.add_argument(
        '--mismatches-file', type=str, default=None, required=True, metavar='FILENAME',
        help='read ids to copy from this file, which is generated by compare_collections.py')
    args = parser.parse_args()

    # connect to source and destination
    source = utils.parse_mongo_url(args.source)
    source_client = utils.mongo_connect(source['host'], source['port'],
                                        ensure_direct=True,
                                        max_pool_size=POOL_SIZE,
                                        read_preference=ReadPreference.SECONDARY_PREFERRED,
                                        document_class=FasterOrderedDict)
    source_collection = source_client[source['db']][source['collection']]
    # NOTE(review): this parses as `(not is_mongos) or is_primary`, so a primary source
    # is rejected even though the message says a primary is acceptable -- confirm whether
    # `not (is_mongos or is_primary)` was intended
    if not source_client.is_mongos or source_client.is_primary:
        raise Exception("source must be a mongos instance or a primary")


    dest = utils.parse_mongo_url(args.dest)
    dest_client = utils.mongo_connect(dest['host'], dest['port'],
                                      max_pool_size=POOL_SIZE,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # compares the two parsed URLs; this runs only after both connections were opened
    if source == dest:
        raise ValueError("source and destination cannot be the same!")
Example #14
0
def copy_indexes(manifests, drop):
    """
    For every manifest, recreates the indexes of the source collection on the
    destination collection, keeping the index name and the `unique` / `sparse` options.

    When the destination already holds documents matching the manifest query, they are
    removed first if `drop` is set (whole collection for an empty query, matching
    documents otherwise); without `drop` only a warning is logged.

    @param manifests   iterable of dicts of (srchost, srcport, srcuser, srcpwd, srcdb,
                       srccol, desthost, destport, destuser, destpwd, destdb, destcol,
                       query)
    @param drop        remove existing destination documents before creating indexes
    """
    ignored_options = ('v', 'ns', 'background')
    copied_options = ('unique', 'sparse')

    for manifest in manifests:
        # connect to both mongo instances, source first
        src_conn = utils.mongo_connect(
            manifest['srchost'],
            manifest['srcport'],
            manifest['srcuser'],
            manifest['srcpwd'],
            maxPoolSize=1,
            read_preference=ReadPreference.SECONDARY)
        src_coll = src_conn[manifest['srcdb']][manifest['srccol']]

        dst_conn = utils.mongo_connect(
            manifest['desthost'],
            manifest['destport'],
            manifest['destuser'],
            manifest['destpwd'],
            maxPoolSize=1)
        dst_coll = dst_conn[manifest['destdb']][manifest['destcol']]

        # deal with documents that are already present in the destination
        has_docs = dst_coll.count(filter=manifest["query"]) > 0
        if has_docs and not drop:
            log.warn("destination collection is not empty: %s.%s" %
                     (manifest['destdb'], manifest['destcol']))
        elif has_docs and manifest["query"] == {}:
            log.info("drop destination collection: %s.%s" %
                     (manifest['destdb'], manifest['destcol']))
            dst_coll.drop()
        elif has_docs:
            log.info("delete destination collection: %s.%s [%s]" %
                     (manifest['destdb'], manifest['destcol'],
                      manifest['query']))
            dst_coll.delete_many(filter=manifest["query"])

        # translate each source index into a create_index() call
        for index_name, index_info in src_coll.index_information().items():
            options = {'name': index_name}
            key_spec = None
            for option, value in index_info.items():
                if option in ignored_options:
                    continue
                if option == 'key':
                    # directions may arrive as floats; normalize them to ints
                    key_spec = [(field_name, int(order))
                                for (field_name, order) in value]
                elif option in copied_options:
                    options[option] = value
                else:
                    # TODO: other index options exist that are not handled here
                    raise NotImplementedError(
                        "don't know how to handle index info key %s" % option)

            assert key_spec is not None
            log.info("ensuring index on %s (options = %s)", key_spec, options)
            dst_coll.create_index(key_spec, **options)
Example #15
0
def apply_oplog(source, dest, percent, state_path):
    """
    Applies oplog entries from source to destination. Since the oplog storage format
    has known and possibly unknown idiosyncracies, we take a conservative approach. For
    each insert or delete op, we can easily replay those. For updates, we do the following:

    1. Note the _id of the updated document
    2. Retrieved the updated document from the source
    3. Upsert the updated document in the destination

    (The per-op handling described above is done by _apply_op_worker, which is not
    defined in this function.)

    This function tails the source's local.oplog.rs for the source collection's
    namespace and hands each op to a pool of worker greenlets. It only returns when the
    tailable cursor dies.

    @param source      connection description of the source; provides 'db' and 'collection'
    @param dest        connection description of the destination; provides 'db' and
                       'collection'
    @param percent     when truthy, only ops whose _id passes utils.id_in_subset are applied
    @param state_path  path handed to CopyStateDB, which supplies the starting oplog
                       timestamp
    """
    gevent.monkey.patch_socket()

    stats = ApplyStats()
    apply_workers = Pool(20) 

    # connect to state db
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(source,
                                        ensure_direct=True,
                                        max_pool_size=30,
                                        read_preference=ReadPreference.SECONDARY,
                                        document_class=FasterOrderedDict)
    source_collection = source_client[source['db']][source['collection']]

    dest_client = utils.mongo_connect(dest,
                                      max_pool_size=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']] 
    oplog = source_client['local']['oplog.rs']

    # print stats periodically
    stats.paused = True
    stats_greenlet = gevent.spawn(oplog_stats_worker, stats)

    # checkpoint oplog position to disk periodically
    checkpoint_greenlet = gevent.spawn(oplog_checkpoint_worker, stats, source, dest, state_db)

    # figure out where we need to start reading oplog entries; rewind our oplog timestamp
    # a bit, to avoid issues with the user pressing Control-C while some ops are pending
    #
    # this works, because oplog entries are idempotent
    start_ts_orig = state_db.get_oplog_ts(source, dest)
    start_ts = bson.Timestamp(time=start_ts_orig.time-TS_REWIND, inc=0)
    log.info("starting apply at %s", start_ts)

    # perform tailing oplog query using the oplog_replay option to efficiently find
    # our starting position in the oplog
    query = {}
    query['ts'] = {'$gte': start_ts}
    # restrict the query to ops on the source collection's namespace
    query['ns'] = source_collection.full_name 
    cursor = oplog.find(
        query,
        cursor_type=CursorType.TAILABLE_AWAIT,
    )
    cursor.add_option(pymongo.cursor._QUERY_OPTIONS['oplog_replay'])
    while True:
        for op in cursor:
            stats.paused = False

            _id = _op_id(op)
            # skip ops outside the requested subset; ops_retrieved counts only kept ops
            if percent and not utils.id_in_subset(_id, percent):
                continue

            stats.ops_retrieved += 1

            # block *all* further ops from being applied if there's a pending
            # op on the current _id, to ensure serialization
            while _id in stats.pending_ids:
                gevent.sleep(0.1)
                stats.sleeps += 1

            # do the real oplog work in a greenlet from the pool
            # NOTE(review): nothing here removes _id from stats.pending_ids; presumably
            # _apply_op_worker does so when it finishes -- confirm
            stats.pending_ids.add(_id)
            apply_workers.spawn(_apply_op_worker,
                                op,
                                source_collection,
                                dest_collection,
                                stats)

            # update our last timestamp; this is *not* guaranteed to be the timestamp of the
            # most recent op, which is impossible because of our out-of-order execution
            #
            # this is an approximation that needs to be accurate to within TS_REWIND seconds
            stats.last_ts = op['ts']

        # while we have a tailable cursor, it can stop iteration if no more results come back
        # in a reasonable time, so sleep for a bit then try to continue iteration
        if cursor.alive:
            log.debug("replayed all oplog entries; sleeping...")
            stats.paused = True
            gevent.sleep(2)
            stats.paused = False
        else:
            log.error("cursor died on us!")
            break

    # only reached after the cursor died and the loop above was left
    # just to silence pyflakes...
    stats_greenlet.kill()
    checkpoint_greenlet.kill()
Example #16
0

if __name__ == '__main__':

    # both currencies of the pair must be given on the command line
    if len(sys.argv) <= 2:
        print("enter a pair of cryptocurrencies separated by a space, e.g. 'BTC ETH'")
        exit(1)

    cur1, cur2 = sys.argv[1], sys.argv[2]

    # a mongo connection for the time series data is opened only when
    # persistence is enabled
    if config.PERSIST:
        db = mongo_connect(config.MONGO_HOST)
    else:
        db = None

    jobs = []
    # start one daemon process running moving_average per exchange
    for ex in config.exchanges:
        if not ex["bypass"]:
            name = ex["name"]
            handler = ex["handler"]
            url = ex["url"]

            p = Process(target=moving_average,
                        args=(db, cur1, cur2, name, url, handler))
            p.daemon = True
            jobs.append(p)
            p.start()
Example #17
0
def apply_oplog(source, dest, percent, state_path):
    """
    Applies oplog entries from source to destination. Since the oplog storage format
    has known and possibly unknown idiosyncrasies, we take a conservative approach. For
    each insert or delete op, we can easily replay those. For updates, we do the following:

    1. Note the _id of the updated document
    2. Retrieve the updated document from the source
    3. Upsert the updated document in the destination

    Tails the source's local.oplog.rs, handing each matching op to a pool of worker
    greenlets, and returns only once the tailable cursor dies.

    @param source      connection info for the source; passed to utils.mongo_connect and
                       indexed by "db" and "collection"
    @param dest        connection info for the destination; used the same way as source
    @param percent     if truthy, only ops whose _id passes
                       utils.id_in_subset(_id, percent) are applied
    @param state_path  path handed to CopyStateDB, from which the starting oplog
                       timestamp is read
    """
    gevent.monkey.patch_socket()

    stats = ApplyStats()
    apply_workers = Pool(20)

    # connect to state db
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(
        source,
        ensure_direct=True,
        max_pool_size=30,
        read_preference=ReadPreference.SECONDARY,
        document_class=FasterOrderedDict,
    )
    source_collection = source_client[source["db"]][source["collection"]]

    dest_client = utils.mongo_connect(dest, max_pool_size=30, document_class=FasterOrderedDict)
    dest_collection = dest_client[dest["db"]][dest["collection"]]
    oplog = source_client["local"]["oplog.rs"]

    # print stats periodically
    stats.paused = True
    stats_greenlet = gevent.spawn(oplog_stats_worker, stats)

    # checkpoint oplog position to disk periodically
    checkpoint_greenlet = gevent.spawn(oplog_checkpoint_worker, stats, source, dest, state_db)

    # figure out where we need to start reading oplog entries; rewind our oplog timestamp
    # a bit, to avoid issues with the user pressing Control-C while some ops are pending
    #
    # this works, because oplog entries are idempotent
    start_ts_orig = state_db.get_oplog_ts(source, dest)
    start_ts = bson.Timestamp(time=start_ts_orig.time - TS_REWIND, inc=0)
    log.info("starting apply at %s", start_ts)

    # perform tailing oplog query using the oplog_replay option to efficiently find
    # our starting position in the oplog
    query = {
        "ts": {"$gte": start_ts},
        "ns": source_collection.full_name,
    }
    cursor = oplog.find(query, cursor_type=pymongo.CursorType.TAILABLE_AWAIT, oplog_replay=True)
    # routed through the logger (not a bare print statement) so it is valid on both
    # Python 2 and 3 and respects the configured log level
    log.debug("oplog cursor: %s", cursor)
    while True:
        for op in cursor:
            stats.paused = False

            _id = _op_id(op)
            if percent and not utils.id_in_subset(_id, percent):
                continue

            stats.ops_retrieved += 1

            # block *all* further ops from being applied if there's a pending
            # op on the current _id, to ensure serialization
            while _id in stats.pending_ids:
                gevent.sleep(0.1)
                stats.sleeps += 1

            # do the real oplog work in a greenlet from the pool
            stats.pending_ids.add(_id)
            apply_workers.spawn(_apply_op_worker, op, source_collection, dest_collection, stats)

            # update our last timestamp; this is *not* guaranteed to be the timestamp of the
            # most recent op, which is impossible because of our out-of-order execution
            #
            # this is an approximation that needs to be accurate to within TS_REWIND seconds
            stats.last_ts = op["ts"]

        # while we have a tailable cursor, it can stop iteration if no more results come back
        # in a reasonable time, so sleep for a bit then try to continue iteration
        if cursor.alive:
            log.debug("replayed all oplog entries; sleeping...")
            stats.paused = True
            gevent.sleep(2)
            stats.paused = False
        else:
            log.error("cursor died on us!")
            break

    # the tailing loop is over; stop the background stats and checkpoint greenlets
    stats_greenlet.kill()
    checkpoint_greenlet.kill()
Example #18
0
def compare_collections(source, dest, percent, error_bp, recent_ops, ids_file):
    """
    compares two collections, using retries to see if collections are eventually consistent

    The _ids to compare come from exactly one place, chosen in this order: recent oplog
    entries (recent_ops), a file of ids (ids_file), or every _id in the source collection.

    @param source      connection info for the source of the data; indexed by 'host',
                       'port', 'db' and 'collection'
    @param dest        connection info for the copied data to verify; same keys as source
    @param percent     percentage of documents to verify; None verifies all of them
    @param error_bp    not used by this function
    @param recent_ops  if truthy, compare only the _ids yielded by
                       _get_ids_for_recent_ops; also used as the expected document total.
                       The source must be a mongod, not a mongos.
    @param ids_file    if truthy (and recent_ops is not), passed to _get_ids_in_file; the
                       first item yielded is taken as the document total
    """
    MismatchLogger.collection_name = source['collection']

    # setup client connections
    source_client = utils.mongo_connect(source['host'], source['port'],
                                        ensure_direct=True,
                                        max_pool_size=POOL_SIZE,
                                        slave_okay=True,
                                        document_class=dict)
    source_collection = source_client[source['db']][source['collection']]

    dest_client = utils.mongo_connect(dest['host'], dest['port'],
                                      ensure_direct=True,
                                      max_pool_size=POOL_SIZE,
                                      slave_okay=True,
                                      document_class=dict)

    dest_collection = dest_client[dest['db']][dest['collection']]

    # setup stats
    stats = CompareStats()
    compare_pool = gevent.pool.Pool(POOL_SIZE)
    retry_pool = gevent.pool.Pool(POOL_SIZE * 5)

    # get just _id's first, because long-running queries degrade significantly
    # over time; reading just _ids is fast enough (or small enough?) not to suffer
    # from this degradation
    if recent_ops:
        # bail out before touching the oplog at all when we're talking to a mongos
        if source_client.is_mongos:
            log.error("cannot read oplogs through mongos; specify mongod instances instead")
            return
        id_getter = _get_ids_for_recent_ops(source_client, recent_ops)
        stats.total_docs = recent_ops
    elif ids_file:
        id_getter = _get_ids_in_file(ids_file)
        # the first item is the document count; next() works on Python 2.6+ and 3,
        # unlike the Python-2-only .next() method
        stats.total_docs = next(id_getter)
    else:
        id_getter = _get_all_ids(source_collection)
        stats.total_docs = source_collection.count()

    if percent is not None:
        stats.total_docs = int(float(stats.total_docs) * percent / 100.0)

    stats_greenlet = gevent.spawn(_stats_worker, stats)

    # read documents in batches, but perform retries individually in separate greenlets
    _ids = []
    for _id in id_getter:
        if percent is not None and not utils.id_in_subset(_id, percent):
            continue

        _ids.append(_id)
        if len(_ids) == READ_SIZE:
            # hand the full batch to a worker and start a fresh list, so the worker's
            # batch is never mutated behind its back
            _ids_to_compare = _ids
            _ids = []
            compare_pool.spawn(_compare_ids_worker,
                               _ids=_ids_to_compare,
                               source_collection=source_collection,
                               dest_collection=dest_collection,
                               stats=stats,
                               retry_pool=retry_pool)

    # compare final batch of _id's
    if _ids:
        compare_pool.spawn(_compare_ids_worker,
                           _ids=_ids,
                           source_collection=source_collection,
                           dest_collection=dest_collection,
                           stats=stats,
                           retry_pool=retry_pool)

    # wait for all greenlets to finish
    compare_pool.join()
    retry_pool.join()
    stats_greenlet.kill()
    stats.log()
    log.info("compare finished")
Example #19
0
def copy_collection(source, dest, state_path, percent):
    """
    Copies all documents from source to destination collection. Inserts documents in
    batches using insert workers, which are each run in their own greenlet.

    Does no safety checks (such as verifying that the destination is empty) -- this is
    up to the caller.

    Records the source's last oplog timestamp in the state database before copying, and
    moves the copy state to STATE_WAITING_FOR_INDICES when done.

    @param source      dict of (host, port, db, collection) for the source
    @param dest        dict of (host, port, db, collection) for the destination
    @param state_path  path of state database
    @param percent     percentage of documents to copy; 0 skips the copy entirely and
                       None copies everything

    @raise Exception   if the source is a mongos rather than a mongod
    """
    gevent.monkey.patch_socket()

    # open state database
    state_db = CopyStateDB(state_path)

    # connect to mongo
    source_client = utils.mongo_connect(
        source['host'],
        source['port'],
        ensure_direct=True,
        max_pool_size=30,
        read_preference=ReadPreference.SECONDARY,
        document_class=FasterOrderedDict)

    source_collection = source_client[source['db']][source['collection']]
    if source_client.is_mongos:
        # interpolate explicitly: Exception() does not apply %-formatting to its
        # arguments, so passing them separately yields a tuple instead of a message
        raise Exception(
            "for performance reasons, sources must be mongod instances; %s:%s is not"
            % (source['host'], source['port']))

    dest_client = utils.mongo_connect(dest['host'],
                                      dest['port'],
                                      max_pool_size=30,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    # record timestamp of last oplog entry, so that we know where to start applying ops
    # later
    oplog_ts = utils.get_last_oplog_entry(source_client)['ts']
    state_db.update_oplog_ts(source, dest, oplog_ts)

    # for testing copying of indices quickly
    if percent == 0:
        log.info("skipping copy because of --percent 0 parameters")
        state_db.update_state(source, dest,
                              CopyStateDB.STATE_WAITING_FOR_INDICES)
        return

    stats = Stats()
    stats.total_docs = int(source_collection.count())
    if percent:
        # hack-ish but good enough for a testing-only feature
        stats.total_docs = int(stats.total_docs * (float(percent) / 100.0))

    # get all _ids, which works around a mongo bug/feature that causes massive slowdowns
    # of long-running, large reads over time
    ids = []
    cursor = source_collection.find(fields=["_id"],
                                    snapshot=True,
                                    timeout=False)
    cursor.batch_size(5000)
    insert_pool = Pool(INSERT_POOL_SIZE)
    stats_greenlet = gevent.spawn(_copy_stats_worker, stats)
    try:
        for doc in cursor:
            _id = doc['_id']

            if percent is not None and not utils.id_in_subset(_id, percent):
                continue

            # when we've gathered enough _ids, spawn a worker greenlet to batch copy the
            # documents corresponding to them
            ids.append(_id)
            if len(ids) % INSERT_SIZE == 0:
                outgoing_ids = ids
                ids = []
                insert_pool.spawn(_find_and_insert_batch_worker,
                                  source_collection=source_collection,
                                  dest_collection=dest_collection,
                                  ids=outgoing_ids,
                                  stats=stats)
            # yield to other greenlets so the insert workers can make progress
            gevent.sleep()
    finally:
        # the cursor was opened with timeout=False, so the server will not reap it
        # for us if iteration is cut short by an error
        cursor.close()

    # insert last batch of documents
    if len(ids) > 0:
        _find_and_insert_batch_worker(source_collection=source_collection,
                                      dest_collection=dest_collection,
                                      ids=ids,
                                      stats=stats)
        stats.log()

    # wait until all other outstanding inserts have finished
    insert_pool.join()
    stats_greenlet.kill()
    log.info("done with initial copy")

    state_db.update_state(source, dest, CopyStateDB.STATE_WAITING_FOR_INDICES)
Example #20
0
    parser.add_argument(
        '--mismatches-file',
        type=str,
        default=None,
        required=True,
        metavar='FILENAME',
        help=
        'read ids to copy from this file, which is generated by compare_collections.py'
    )
    args = parser.parse_args()

    # connect to source and destination
    source = utils.parse_mongo_url(args.source)
    source_client = utils.mongo_connect(
        source,
        ensure_direct=True,
        maxPoolSize=POOL_SIZE,
        read_preference=ReadPreference.SECONDARY_PREFERRED,
        document_class=FasterOrderedDict)
    source_collection = source_client[source['db']][source['collection']]
    if not source_client.is_mongos or source_client.is_primary:
        raise Exception("source must be a mongos instance or a primary")

    dest = utils.parse_mongo_url(args.dest)
    dest_client = utils.mongo_connect(dest,
                                      maxPoolSize=POOL_SIZE,
                                      document_class=FasterOrderedDict)
    dest_collection = dest_client[dest['db']][dest['collection']]

    if source == dest:
        raise ValueError("source and destination cannot be the same!")