Example #1
 def get_all_ns():
     ns_set = []
     gridfs_ns_set = []
     db_list = retry_until_ok(self.primary_client.database_names)
     for database in db_list:
         if database == "config" or database == "local":
             continue
         coll_list = retry_until_ok(
             self.primary_client[database].collection_names)
         for coll in coll_list:
             # ignore system collections
             if coll.startswith("system."):
                 continue
             # ignore gridfs chunks collections
             if coll.endswith(".chunks"):
                 continue
             if coll.endswith(".files"):
                 namespace = "%s.%s" % (database, coll)
                 namespace = namespace[:-len(".files")]
                 if self.namespace_config.gridfs_namespace(namespace):
                     gridfs_ns_set.append(namespace)
             else:
                 namespace = "%s.%s" % (database, coll)
                 if self.namespace_config.map_namespace(namespace):
                     ns_set.append(namespace)
     return ns_set, gridfs_ns_set
        def docs_to_dump():
            for namespace in dump_set:
                logging.info("dumping collection %s" % namespace)
                database, coll = namespace.split('.', 1)
                last_id = None
                attempts = 0

                # Loop to handle possible AutoReconnect
                while attempts < 60:
                    target_coll = self.main_connection[database][coll]
                    if not last_id:
                        cursor = util.retry_until_ok(
                            target_coll.find,
                            sort=[("_id", pymongo.ASCENDING)]
                        )
                    else:
                        cursor = util.retry_until_ok(
                            target_coll.find,
                            {"_id": {"$gt": last_id}},
                            sort=[("_id", pymongo.ASCENDING)]
                        )
                    try:
                        for doc in cursor:
                            if not self.running:
                                raise StopIteration
                            doc["ns"] = self.dest_mapping.get(
                                namespace, namespace)
                            doc["_ts"] = long_ts
                            last_id = doc["_id"]
                            yield doc
                        break
                    except pymongo.errors.AutoReconnect:
                        attempts += 1
                        time.sleep(1)
Example #3
 def docs_to_dump(from_coll):
     last_id = None
     attempts = 0
     projection = self.namespace_config.projection(from_coll.full_name)
     # Loop to handle possible AutoReconnect
     while attempts < 60:
         if last_id is None:
             cursor = retry_until_ok(
                 from_coll.find,
                 projection=projection,
                 sort=[("_id", pymongo.ASCENDING)]
             )
         else:
             cursor = retry_until_ok(
                 from_coll.find,
                 {"_id": {"$gt": last_id}},
                 projection=projection,
                 sort=[("_id", pymongo.ASCENDING)]
             )
         try:
             for doc in cursor:
                 if not self.running:
                     # Thread was joined while performing the
                     # collection dump.
                     dump_cancelled[0] = True
                     raise StopIteration
                 last_id = doc["_id"]
                 yield doc
             break
         except (pymongo.errors.AutoReconnect,
                 pymongo.errors.OperationFailure):
             attempts += 1
             time.sleep(1)
    def test_deletions(self):
        """Test rolling back 'd' operations"""

        self.opman.start()

        # Insert a document, wait till it replicates to secondary
        self.main_conn["test"]["mc"].insert_one({"i": 0})
        self.main_conn["test"]["mc"].insert_one({"i": 1})
        self.assertEqual(self.primary_conn["test"]["mc"].find().count(), 2)
        assert_soon(
            lambda: self.secondary_conn["test"]["mc"].count() == 2,
            "first write didn't replicate to secondary",
        )

        # Kill the primary, wait for secondary to be promoted
        self.repl_set.primary.stop(destroy=False)
        assert_soon(
            lambda: self.secondary_conn["admin"].command("isMaster")["ismaster"]
        )

        # Delete first document
        retry_until_ok(self.main_conn["test"]["mc"].delete_one, {"i": 0})
        self.assertEqual(self.secondary_conn["test"]["mc"].count(), 1)

        # Wait for replication to doc manager
        assert_soon(
            lambda: len(self.opman.doc_managers[0]._search()) == 1,
            "delete was not replicated to doc manager",
        )

        # Kill the new primary
        self.repl_set.secondary.stop(destroy=False)

        # Start both servers back up
        self.repl_set.primary.start()
        primary_admin = self.primary_conn["admin"]
        assert_soon(
            lambda: primary_admin.command("isMaster")["ismaster"],
            "restarted primary never resumed primary status",
        )
        self.repl_set.secondary.start()
        assert_soon(
            lambda: retry_until_ok(
                self.secondary_conn.admin.command, "replSetGetStatus"
            )["myState"]
            == 2,
            "restarted secondary never resumed secondary status",
        )

        # Both documents should exist in mongo
        assert_soon(lambda: retry_until_ok(self.main_conn["test"]["mc"].count) == 2)

        # Both document should exist in doc manager
        doc_manager = self.opman.doc_managers[0]
        assert_soon(
            lambda: len(list(doc_manager._search())) == 2,
            ("Expected two documents, but got: %r" % list(doc_manager._search())),
        )

        self.opman.join()
Example #5
    def test_rollback(self):
        """Tests rollback. We force a rollback by adding a doc, killing the
            primary, adding another doc, killing the new primary, and then
            restarting both.
        """

        primary_conn = Connection(HOSTNAME, int(PORTS_ONE['PRIMARY']))

        self.conn['test']['test'].insert({'name': 'paul'}, safe=True)
        while self.conn['test']['test'].find({'name': 'paul'}).count() != 1:
            time.sleep(1)
        while len(self.mongo_doc._search()) != 1:
            time.sleep(1)

        kill_mongo_proc(HOSTNAME, PORTS_ONE['PRIMARY'])

        new_primary_conn = Connection(HOSTNAME, int(PORTS_ONE['SECONDARY']))

        admin = new_primary_conn['admin']
        while admin.command("isMaster")['ismaster'] is False:
            time.sleep(1)
        time.sleep(5)
        count = 0
        while True:
            try:
                result_set_1 = self.conn['test']['test'].insert(
                    {'name': 'pauline'}, safe=True)
                break
            except OperationFailure:
                time.sleep(1)
                count += 1
                if count >= 60:
                    sys.exit(1)
                continue
        while len(self.mongo_doc._search()) != 2:
            time.sleep(1)
        result_set_1 = self.mongo_doc._search()
        result_set_2 = self.conn['test']['test'].find_one({'name': 'pauline'})
        self.assertEqual(len(result_set_1), 2)
        # make sure pauline is there
        for item in result_set_1:
            if item['name'] == 'pauline':
                self.assertEqual(item['_id'], result_set_2['_id'])
        kill_mongo_proc(HOSTNAME, PORTS_ONE['SECONDARY'])

        start_mongo_proc(PORTS_ONE['PRIMARY'], "demo-repl", "/replset1a",
                       "/replset1a.log", None)
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        start_mongo_proc(PORTS_ONE['SECONDARY'], "demo-repl", "/replset1b",
                       "/replset1b.log", None)

        time.sleep(2)
        result_set_1 = self.mongo_doc._search()
        self.assertEqual(len(result_set_1), 1)
        for item in result_set_1:
            self.assertEqual(item['name'], 'paul')
        find_cursor = retry_until_ok(self.conn['test']['test'].find)
        self.assertEqual(retry_until_ok(find_cursor.count), 1)
        def docs_to_dump(namespace):
            database, coll = namespace.split('.', 1)
            last_id = None
            attempts = 0

            # Loop to handle possible AutoReconnect
            while attempts < 60:
                target_coll = self.primary_client[database][coll]
                if not last_id:
                    cursor = util.retry_until_ok(
                        target_coll.find,
                        projection=self._projection,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                else:
                    cursor = util.retry_until_ok(
                        target_coll.find,
                        {"_id": {"$gt": last_id}},
                        projection=self._projection,
                        sort=[("_id", pymongo.ASCENDING)]
                    )
                try:
                    for doc in cursor:
                        if not self.running:
                            raise StopIteration
                        last_id = doc["_id"]
                        yield doc
                    break
                except (pymongo.errors.AutoReconnect,
                        pymongo.errors.OperationFailure):
                    attempts += 1
                    time.sleep(1)
    def test_elastic_commit(self):
        """Test the auto_commit_interval attribute."""
        doc = {'_id': '3', 'name': 'Waldo'}

        # test cases:
        # None = no autocommit
        # 0 = commit immediately
        # x > 0 = commit within x seconds
        for commit_interval in [None, 0, 2, 8]:
            docman = DocManager(elastic_pair,
                                auto_commit_interval=commit_interval)
            docman.upsert(doc, *TESTARGS)
            if commit_interval:
                # Allow just a little extra time
                time.sleep(commit_interval + 2)
            results = list(self._search())
            if commit_interval is None:
                self.assertEqual(len(results), 0,
                                 "should not commit document with "
                                 "auto_commit_interval = None")
            else:
                self.assertEqual(len(results), 1,
                                 "should commit document with "
                                 "auto_commit_interval = %s" % (
                                 commit_interval,))
                self.assertEqual(results[0]["name"], "Waldo")
            docman.stop()
            self._remove()
            retry_until_ok(self.elastic_conn.indices.refresh, index="")
    def test_stressed_rollback(self):
        """Test stressed rollback with number of documents equal to specified
        in global variable. Rollback is performed like before, but with more
            documents.
        """
        while len(self.synchronizer._search()) != 0:
            time.sleep(1)
        for i in range(0, NUMBER_OF_DOC_DIRS):
            self.conn['test']['test'].insert(
                {'name': 'Paul ' + str(i)}, safe=True)

        while len(self.synchronizer._search()) != NUMBER_OF_DOC_DIRS:
            time.sleep(1)
        primary_conn = Connection(HOSTNAME, int(PORTS_ONE['PRIMARY']))
        kill_mongo_proc(HOSTNAME, PORTS_ONE['PRIMARY'])

        new_primary_conn = Connection(HOSTNAME, int(PORTS_ONE['SECONDARY']))
        admin_db = new_primary_conn['admin']
        while admin_db.command("isMaster")['ismaster'] is False:
            time.sleep(1)
        time.sleep(5)
        count = -1
        while count + 1 < NUMBER_OF_DOC_DIRS:
            try:
                count += 1
                self.conn['test']['test'].insert(
                    {'name': 'Pauline ' + str(count)}, safe=True)
            except (OperationFailure, AutoReconnect):
                time.sleep(1)
        while (len(self.synchronizer._search())
               != self.conn['test']['test'].find().count()):
            time.sleep(1)
        result_set_1 = self.synchronizer._search()
        i = 0
        for item in result_set_1:
            if 'Pauline' in item['name']:
                result_set_2 = self.conn['test']['test'].find_one(
                    {'name': item['name']})
                self.assertEqual(item['_id'], result_set_2['_id'])

        kill_mongo_proc(HOSTNAME, PORTS_ONE['SECONDARY'])

        start_mongo_proc(PORTS_ONE['PRIMARY'], "demo-repl", "/replset1a",
                       "/replset1a.log", None)
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        start_mongo_proc(PORTS_ONE['SECONDARY'], "demo-repl", "/replset1b",
                       "/replset1b.log", None)

        while (len(self.synchronizer._search()) != NUMBER_OF_DOC_DIRS):
            time.sleep(5)

        result_set_1 = self.synchronizer._search()
        self.assertEqual(len(result_set_1), NUMBER_OF_DOC_DIRS)
        for item in result_set_1:
            self.assertTrue('Paul' in item['name'])
        find_cursor = retry_until_ok(self.conn['test']['test'].find)
        self.assertEqual(retry_until_ok(find_cursor.count), NUMBER_OF_DOC_DIRS)
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and the number of documents left in the cursor.
        """

        timestamp = self.checkpoint
        if self.checkpoint is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                if timestamp is None:
                    return None, 0
            else:
                # Collection dump disabled:
                # return cursor to beginning of oplog.
                cursor = self.get_oplog_cursor()
                self.checkpoint = self.get_last_oplog_timestamp()
                return cursor, retry_until_ok(cursor.count)

        for i in range(60):
            cursor = self.get_oplog_cursor(timestamp)
            cursor_len = retry_until_ok(cursor.count)

            if cursor_len == 0:
                # rollback, update checkpoint, and retry
                logging.debug("OplogThread: Initiating rollback from "
                              "get_oplog_cursor")
                self.checkpoint = self.rollback()
                return self.init_cursor()

            # try to get the first oplog entry
            try:
                first_oplog_entry = next(cursor)
            except StopIteration:
                # It's possible for the cursor to become invalid
                # between the cursor.count() call and now
                time.sleep(1)
                continue

            # first entry should be last oplog entry processed
            cursor_ts_long = util.bson_ts_to_long(
                first_oplog_entry.get("ts"))
            given_ts_long = util.bson_ts_to_long(timestamp)
            if cursor_ts_long > given_ts_long:
                # first entry in oplog is beyond timestamp
                # we've fallen behind
                return None, 0

            # first entry has been consumed
            return cursor, cursor_len - 1

        else:
            raise errors.MongoConnectorError(
                "Could not initialize oplog cursor.")
    def test_single_target(self):
        """Test with a single replication target"""

        self.opman.start()

        # Insert first document with primary up
        self.main_conn["test"]["mc"].insert({"i": 0})
        self.assertEqual(self.primary_conn["test"]["mc"].find().count(), 1)

        # Make sure the insert is replicated
        secondary = self.secondary_conn
        assert_soon(lambda: secondary["test"]["mc"].count() == 1, "first write didn't replicate to secondary")

        # Kill the primary
        self.repl_set.primary.stop(destroy=False)

        # Wait for the secondary to be promoted
        assert_soon(lambda: secondary["admin"].command("isMaster")["ismaster"])

        # Insert another document. This will be rolled back later
        retry_until_ok(self.main_conn["test"]["mc"].insert, {"i": 1})
        self.assertEqual(secondary["test"]["mc"].count(), 2)

        # Wait for replication to doc manager
        assert_soon(
            lambda: len(self.opman.doc_managers[0]._search()) == 2, "not all writes were replicated to doc manager"
        )

        # Kill the new primary
        self.repl_set.secondary.stop(destroy=False)

        # Start both servers back up
        self.repl_set.primary.start()
        primary_admin = self.primary_conn["admin"]
        assert_soon(
            lambda: primary_admin.command("isMaster")["ismaster"], "restarted primary never resumed primary status"
        )
        self.repl_set.secondary.start()
        assert_soon(
            lambda: retry_until_ok(secondary.admin.command, "replSetGetStatus")["myState"] == 2,
            "restarted secondary never resumed secondary status",
        )
        assert_soon(
            lambda: retry_until_ok(self.main_conn.test.mc.find().count) > 0,
            "documents not found after primary/secondary restarted",
        )

        # Only first document should exist in MongoDB
        self.assertEqual(self.main_conn["test"]["mc"].count(), 1)
        self.assertEqual(self.main_conn["test"]["mc"].find_one()["i"], 0)

        # Same case should hold for the doc manager
        doc_manager = self.opman.doc_managers[0]
        assert_soon(lambda: len(doc_manager._search()) == 1, "documents never rolled back in doc manager.")
        self.assertEqual(doc_manager._search()[0]["i"], 0)

        # cleanup
        self.opman.join()
    def test_rollback(self):
        """Test behavior during a MongoDB rollback.

        We force a rollback by adding a doc, killing the primary,
        adding another doc, killing the new primary, and then
        restarting both.
        """
        primary_conn = self.repl_set.primary.client()

        # This doc can be picked up in the collection dump
        self.conn['test']['test'].insert_one({'name': 'paul'})
        condition1 = lambda: self.conn['test']['test'].find(
            {'name': 'paul'}).count() == 1
        condition2 = lambda: self._count() == 1
        assert_soon(condition1)
        assert_soon(condition2)

        # This doc is definitely not picked up by collection dump
        self.conn['test']['test'].insert_one({'name': 'pauly'})

        self.repl_set.primary.stop(destroy=False)

        new_primary_conn = self.repl_set.secondary.client()

        admin = new_primary_conn['admin']
        assert_soon(lambda: admin.command("isMaster")['ismaster'])
        time.sleep(5)
        retry_until_ok(self.conn.test.test.insert_one,
                       {'name': 'pauline'})
        assert_soon(lambda: self._count() == 3)
        result_set_1 = list(self._search())
        result_set_2 = self.conn['test']['test'].find_one({'name': 'pauline'})
        self.assertEqual(len(result_set_1), 3)
        #make sure pauline is there
        for item in result_set_1:
            if item['name'] == 'pauline':
                self.assertEqual(item['_id'], str(result_set_2['_id']))
        self.repl_set.secondary.stop(destroy=False)

        self.repl_set.primary.start()
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        self.repl_set.secondary.start()

        time.sleep(2)
        result_set_1 = list(self._search())
        self.assertEqual(len(result_set_1), 2)

        if result_set_1[0]['name'] == 'paul':
            self.assertEqual(result_set_1[1]['name'], 'pauly')
        elif result_set_1[0]['name'] == 'pauly':
            self.assertEqual(result_set_1[1]['name'], 'paul')
        else:
            self.assertTrue(0, 'Unknown document retrieved')

        find_cursor = retry_until_ok(self.conn['test']['test'].find)
        self.assertEqual(retry_until_ok(find_cursor.count), 2)
Example #12
    def test_stressed_rollback(self):
        """Test stressed rollback with number of documents equal to specified
            in global variable. Strategy for rollback is the same as before.
        """

        for i in range(0, NUMBER_OF_DOC_DIRS):
            self.conn['test']['test'].insert({'name': 'Paul ' + str(i)},
                safe=True)

        search = self.mongo_doc._search
        condition = lambda : sum(1 for _ in search()) == NUMBER_OF_DOC_DIRS
        wait_for(condition)
        primary_conn = Connection(HOSTNAME, int(PORTS_ONE['PRIMARY']))
        kill_mongo_proc(HOSTNAME, PORTS_ONE['PRIMARY'])

        new_primary_conn = Connection(HOSTNAME, int(PORTS_ONE['SECONDARY']))

        admin = new_primary_conn['admin']
        wait_for(lambda : admin.command("isMaster")['ismaster'])

        time.sleep(5)
        count = -1
        while count + 1 < NUMBER_OF_DOC_DIRS:
            try:
                count += 1
                self.conn['test']['test'].insert({'name': 'Pauline ' +
                    str(count)}, safe=True)
            except (OperationFailure, AutoReconnect):
                time.sleep(1)
        wait_for(lambda : sum(1 for _ in self.mongo_doc._search())
                 == self.conn['test']['test'].find().count())
        result_set_1 = self.mongo_doc._search()
        for item in result_set_1:
            if 'Pauline' in item['name']:
                result_set_2 = self.conn['test']['test'].find_one(
                    {'name': item['name']})
                self.assertEqual(item['_id'], result_set_2['_id'])

        kill_mongo_proc(HOSTNAME, PORTS_ONE['SECONDARY'])

        start_mongo_proc(PORTS_ONE['PRIMARY'], "demo-repl", "/replset1a",
                       "/replset1a.log", None)
        db_admin = primary_conn['admin']
        wait_for(lambda : db_admin.command("isMaster")['ismaster'])
        start_mongo_proc(PORTS_ONE['SECONDARY'], "demo-repl", "/replset1b",
                       "/replset1b.log", None)

        search = self.mongo_doc._search
        condition = lambda : sum(1 for _ in search()) == NUMBER_OF_DOC_DIRS
        wait_for(condition)

        result_set_1 = list(self.mongo_doc._search())
        self.assertEqual(len(result_set_1), NUMBER_OF_DOC_DIRS)
        for item in result_set_1:
            self.assertTrue('Paul' in item['name'])
        find_cursor = retry_until_ok(self.conn['test']['test'].find)
        self.assertEqual(retry_until_ok(find_cursor.count), NUMBER_OF_DOC_DIRS)
    def test_rollback(self):
        """Tests rollback. Rollback is performed by inserting one document,
            killing primary, inserting another doc, killing secondary,
            and then restarting both.
        """
        primary_conn = Connection(HOSTNAME, int(PORTS_ONE['PRIMARY']))

        self.conn['test']['test'].insert({'name': 'paul'}, safe=True)
        while self.conn['test']['test'].find({'name': 'paul'}).count() != 1:
            time.sleep(1)

        kill_mongo_proc(HOSTNAME, PORTS_ONE['PRIMARY'])

        new_primary_conn = Connection(HOSTNAME, int(PORTS_ONE['SECONDARY']))
        admin_db = new_primary_conn['admin']

        while admin_db.command("isMaster")['ismaster'] is False:
            time.sleep(1)
        time.sleep(5)
        count = 0
        while True:
            try:
                self.conn['test']['test'].insert({'name': 'pauline'}, safe=True)
                break
            except OperationFailure:
                count += 1
                if count > 60:
                    self.fail('Call to insert failed too'
                    ' many times in test_rollback')
                time.sleep(1)
                continue
        while (len(self.synchronizer._search()) != 2):
            time.sleep(1)
        result_set_1 = self.synchronizer._search()
        result_set_2 = self.conn['test']['test'].find_one({'name': 'pauline'})
        self.assertEqual(len(result_set_1), 2)
        for item in result_set_1:
            if item['name'] == 'pauline':
                self.assertEqual(item['_id'], result_set_2['_id'])
        kill_mongo_proc(HOSTNAME, PORTS_ONE['SECONDARY'])

        start_mongo_proc(PORTS_ONE['PRIMARY'], "demo-repl", "/replset1a",
                       "/replset1a.log", None)
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        start_mongo_proc(PORTS_ONE['SECONDARY'], "demo-repl", "/replset1b",
                       "/replset1b.log", None)

        time.sleep(2)
        result_set_1 = self.synchronizer._search()
        self.assertEqual(len(result_set_1), 1)
        for item in result_set_1:
            self.assertEqual(item['name'], 'paul')
        find_cursor = retry_until_ok(self.conn['test']['test'].find)
        self.assertEqual(retry_until_ok(find_cursor.count), 1)
 def test_retry_until_ok_authorization_mongodb_24(self):
     """Test retry_until_ok does not mask authorization failures in
     MongoDB 2.4.
     """
     with self.assertRaises(errors.OperationFailure):
         retry_until_ok(
             err_func,
             errors.OperationFailure("", details={"errmsg": "unauthorized"}),
         )
     self.assertEqual(err_func.counter, 1)
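This test expects the retry helper to fail fast on authorization errors: err_func is called exactly once rather than being retried. As an illustration only (not mongo-connector's actual code), a check of this kind could inspect the details of a pymongo-style OperationFailure, where MongoDB 2.4 reports the problem through an "unauthorized" error message:

def is_authorization_error(exc):
    # Hypothetical helper: return True when an OperationFailure should not
    # be retried because it signals an authorization failure.
    details = getattr(exc, "details", None) or {}
    errmsg = details.get("errmsg", "")
    return "unauthorized" in errmsg or "not authorized" in errmsg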
    def test_single_target(self):
        """Test with a single replication target"""

        self.opman.start()

        # Insert first document with primary up
        self.main_conn["test"]["mc"].insert({"i": 0})
        self.assertEqual(self.primary_conn["test"]["mc"].find().count(), 1)

        # Make sure the insert is replicated
        secondary = self.secondary_conn
        self.assertTrue(wait_for(lambda: secondary["test"]["mc"].count() == 1),
                        "first write didn't replicate to secondary")

        # Kill the primary
        kill_mongo_proc(self.primary_p, destroy=False)

        # Wait for the secondary to be promoted
        while not secondary["admin"].command("isMaster")["ismaster"]:
            time.sleep(1)

        # Insert another document. This will be rolled back later
        retry_until_ok(self.main_conn["test"]["mc"].insert, {"i": 1})
        self.assertEqual(secondary["test"]["mc"].count(), 2)

        # Wait for replication to doc manager
        c = lambda: len(self.opman.doc_managers[0]._search()) == 2
        self.assertTrue(wait_for(c),
                        "not all writes were replicated to doc manager")

        # Kill the new primary
        kill_mongo_proc(self.secondary_p, destroy=False)

        # Start both servers back up
        restart_mongo_proc(self.primary_p)
        primary_admin = self.primary_conn["admin"]
        while not primary_admin.command("isMaster")["ismaster"]:
            time.sleep(1)
        restart_mongo_proc(self.secondary_p)
        while secondary["admin"].command("replSetGetStatus")["myState"] != 2:
            time.sleep(1)
        while retry_until_ok(self.main_conn["test"]["mc"].find().count) == 0:
            time.sleep(1)

        # Only first document should exist in MongoDB
        self.assertEqual(self.main_conn["test"]["mc"].count(), 1)
        self.assertEqual(self.main_conn["test"]["mc"].find_one()["i"], 0)

        # Same case should hold for the doc manager
        doc_manager = self.opman.doc_managers[0]
        self.assertEqual(len(doc_manager._search()), 1)
        self.assertEqual(doc_manager._search()[0]["i"], 0)

        # cleanup
        self.opman.join()
Example #16
    def test_stressed_rollback(self):
        """Test stressed rollback with number of documents equal to specified
            in global variable. Strategy for rollback is the same as before.
        """

        for i in range(0, STRESS_COUNT):
            self.conn['test']['test'].insert({'name': 'Paul ' + str(i)})

        search = self.mongo_doc._search
        condition = lambda: sum(1 for _ in search()) == STRESS_COUNT
        assert_soon(condition)
        primary_conn = MongoClient(mongo_host, self.primary_p)

        kill_mongo_proc(self.primary_p, destroy=False)

        new_primary_conn = MongoClient(mongo_host, self.secondary_p)

        admin = new_primary_conn['admin']
        assert_soon(lambda: admin.command("isMaster")['ismaster'])

        time.sleep(5)
        count = -1
        while count + 1 < STRESS_COUNT:
            try:
                count += 1
                self.conn['test']['test'].insert(
                    {'name': 'Pauline ' + str(count)})
            except (OperationFailure, AutoReconnect):
                time.sleep(1)
        assert_soon(lambda: sum(1 for _ in self.mongo_doc._search())
                    == self.conn['test']['test'].find().count())
        result_set_1 = self.mongo_doc._search()
        for item in result_set_1:
            if 'Pauline' in item['name']:
                result_set_2 = self.conn['test']['test'].find_one(
                    {'name': item['name']})
                self.assertEqual(item['_id'], result_set_2['_id'])

        kill_mongo_proc(self.secondary_p, destroy=False)

        restart_mongo_proc(self.primary_p)
        db_admin = primary_conn['admin']
        assert_soon(lambda: db_admin.command("isMaster")['ismaster'])
        restart_mongo_proc(self.secondary_p)

        search = self.mongo_doc._search
        condition = lambda: sum(1 for _ in search()) == STRESS_COUNT
        assert_soon(condition)

        result_set_1 = list(self.mongo_doc._search())
        self.assertEqual(len(result_set_1), STRESS_COUNT)
        for item in result_set_1:
            self.assertTrue('Paul' in item['name'])
        find_cursor = retry_until_ok(self.conn['test']['test'].find)
        self.assertEqual(retry_until_ok(find_cursor.count), STRESS_COUNT)
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.namespace_set

        #no namespaces specified
        if not self.namespace_set:
            db_list = self.main_connection.database_names()
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = self.main_connection[database].collection_names()
                for coll in coll_list:
                    if coll.startswith("system"):
                        continue
                    namespace = str(database) + "." + str(coll)
                    dump_set.append(namespace)

        timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        for namespace in dump_set:
            database, coll = namespace.split('.', 1)
            target_coll = self.main_connection[database][coll]
            cursor = util.retry_until_ok(target_coll.find)
            long_ts = util.bson_ts_to_long(timestamp)

            try:
                for doc in cursor:
                    # Could spend a long time in this loop
                    if not self.running:
                        # Return None so we don't save our progress
                        return None
                    doc['ns'] = namespace
                    doc['_ts'] = long_ts
                    try:
                        self.doc_manager.upsert(doc)
                    except errors.OperationFailed:
                        logging.error("Unable to insert %s" % (doc))
            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure):

                err_msg = "OplogManager: Failed during dump collection"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                return

        return timestamp
Example #18
    def test_retry_until_ok(self):
        """Test retry_until_ok
        """

        self.assertTrue(retry_until_ok(err_func))
        self.assertEqual(err_func.counter, 3)

        # RuntimeError should not be caught
        def raise_runtime_error():
            raise RuntimeError
        with self.assertRaises(RuntimeError):
            retry_until_ok(raise_runtime_error)
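Taken together with the authorization and RuntimeError cases, these tests describe the expected behavior of the helper: transient failures are retried with a short sleep, bounded by a maximum number of attempts, while programming errors surface immediately. A minimal sketch with those properties, assuming nothing about mongo-connector's real implementation, might look like this:

import time


def retry_sketch(func, *args, **kwargs):
    # Illustrative retry loop in the spirit of retry_until_ok: call func
    # until it succeeds, sleeping between attempts, with a bounded number
    # of tries. The real helper presumably retries only transient errors
    # such as pymongo.errors.AutoReconnect.
    max_attempts = 60
    for attempt in range(max_attempts):
        try:
            return func(*args, **kwargs)
        except RuntimeError:
            # Never mask programming errors behind retries.
            raise
        except Exception:
            if attempt == max_attempts - 1:
                raise
            time.sleep(1)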
    def test_stressed_rollback(self):
        """Stress test for a rollback with many documents."""
        self.opman.start()

        c = self.main_conn.test.mc
        docman = self.opman.doc_managers[0]
        c2 = c.with_options(write_concern=WriteConcern(w=2))
        c2.insert_many([{"i": i} for i in range(STRESS_COUNT)])
        assert_soon(lambda: c2.count() == STRESS_COUNT)

        def condition():
            return len(docman._search()) == STRESS_COUNT

        assert_soon(
            condition,
            (
                "Was expecting %d documents in DocManager, "
                "but %d found instead." % (STRESS_COUNT, len(docman._search()))
            ),
        )

        primary_conn = self.repl_set.primary.client()
        self.repl_set.primary.stop(destroy=False)
        new_primary_conn = self.repl_set.secondary.client()

        admin = new_primary_conn.admin
        assert_soon(lambda: retry_until_ok(admin.command, "isMaster")["ismaster"])

        retry_until_ok(
            c.insert_many, [{"i": str(STRESS_COUNT + i)} for i in range(STRESS_COUNT)]
        )

        self.repl_set.secondary.stop(destroy=False)

        self.repl_set.primary.start()
        admin = primary_conn.admin
        assert_soon(lambda: retry_until_ok(admin.command, "isMaster")["ismaster"])
        self.repl_set.secondary.start()

        assert_soon(lambda: retry_until_ok(c.count) == STRESS_COUNT)
        assert_soon(
            condition,
            (
                "Was expecting %d documents in DocManager, "
                "but %d found instead." % (STRESS_COUNT, len(docman._search()))
            ),
        )

        self.opman.join()
Example #20
    def test_rollback(self):
        """Test behavior during a MongoDB rollback.

        We force a rollback by adding a doc, killing the primary,
        adding another doc, killing the new primary, and then
        restarting both.
        """
        primary_conn = self.repl_set.primary.client()

        self.conn['test']['test'].insert({'name': 'paul'})
        condition1 = lambda: self.conn['test']['test'].find(
            {'name': 'paul'}).count() == 1
        condition2 = lambda: self._count() == 1
        assert_soon(condition1)
        assert_soon(condition2)

        self.repl_set.primary.stop(destroy=False)

        new_primary_conn = self.repl_set.secondary.client()

        admin = new_primary_conn['admin']
        assert_soon(lambda: admin.command("isMaster")['ismaster'])
        time.sleep(5)
        retry_until_ok(self.conn.test.test.insert,
                       {'name': 'pauline'})
        assert_soon(lambda: self._count() == 2)
        result_set_1 = list(self._search())
        result_set_2 = self.conn['test']['test'].find_one({'name': 'pauline'})
        self.assertEqual(len(result_set_1), 2)
        #make sure pauline is there
        for item in result_set_1:
            if item['name'] == 'pauline':
                self.assertEqual(item['_id'], str(result_set_2['_id']))
        self.repl_set.secondary.stop(destroy=False)

        self.repl_set.primary.start()
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        self.repl_set.secondary.start()

        time.sleep(2)
        result_set_1 = list(self._search())
        self.assertEqual(len(result_set_1), 1)

        for item in result_set_1:
            self.assertEqual(item['name'], 'paul')
        find_cursor = retry_until_ok(self.conn['test']['test'].find)
        self.assertEqual(retry_until_ok(find_cursor.count), 1)
    def commit(self):
        """Send bulk requests and clear buffer"""
        with self.lock:
            try:
                action_buffer = self.BulkBuffer.get_buffer()
                if action_buffer:
                    successes, errors = bulk(self.elastic, action_buffer)
                    LOG.debug("Bulk successfully done for %d docs" % successes)
                    if errors:
                        LOG.error("Error occurred during bulk to ElasticSearch:"
                                  " %r" % errors)
            except Exception:
                LOG.exception("Exception while committing to Elasticsearch")

        retry_until_ok(self.elastic.indices.refresh, index="")
    def test_auto_send_interval(self):
        """Test the auto_send_interval

        auto_send_interval should control the amount of time to wait before
        sending (but not committing) buffered operations.
        """
        doc = {"_id": "3", "name": "Waldo"}

        # test cases:
        # None, 0 = no auto send
        # x > 0 = send buffered operations within x seconds
        for send_interval in [None, 0, 3, 8]:
            docman = DocManager(
                elastic_pair, autoSendInterval=send_interval, auto_commit_interval=None
            )
            docman.upsert(doc, *TESTARGS)
            if send_interval:
                # Allow just a little extra time
                time.sleep(send_interval + 2)
            results = list(self._search())
            self.assertEqual(
                len(results),
                0,
                "documents should not be commited with "
                "auto_commit_interval=None and auto_commit_interval=%s"
                % (send_interval,),
            )
            # Commit the possibly sent changes and search again
            retry_until_ok(self.elastic_conn.indices.refresh, index="")
            results = list(self._search())
            if not send_interval:
                self.assertEqual(
                    len(results),
                    0,
                    "should not send document with auto_send_interval=%s"
                    % (send_interval,),
                )
            else:
                self.assertEqual(
                    len(results),
                    1,
                    "should send document with auto_send_interval=%s"
                    % (send_interval,),
                )
                self.assertEqual(results[0]["name"], "Waldo")
            docman.stop()
            self._remove()
            retry_until_ok(self.elastic_conn.indices.refresh, index="")
 def log_replication_lag(self):
     checkpoint = self.opman.checkpoint
     if checkpoint is None:
         return
     newest_write = retry_until_ok(self.opman.get_last_oplog_timestamp)
     if newest_write < checkpoint:
         # OplogThread will perform a rollback, don't log anything
         return
     lag_secs = newest_write.time - checkpoint.time
     if lag_secs > 0:
         LOG.info(
             "OplogThread for replica set '%s' is %s seconds behind " "the oplog.",
             self.opman.replset_name,
             lag_secs,
         )
     else:
         lag_inc = newest_write.inc - checkpoint.inc
         if lag_inc > 0:
             LOG.info(
                 "OplogThread for replica set '%s' is %s entries "
                 "behind the oplog.",
                 self.opman.replset_name,
                 lag_inc,
             )
         else:
             LOG.info(
                 "OplogThread for replica set '%s' is up to date " "with the oplog.",
                 self.opman.replset_name,
             )
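The lag computation above compares the time (seconds) and inc (ordinal within a second) components of two BSON timestamps. A quick illustration using bson.timestamp.Timestamp, with made-up values:

from bson.timestamp import Timestamp

checkpoint = Timestamp(1500000000, 5)
newest_write = Timestamp(1500000007, 2)

lag_secs = newest_write.time - checkpoint.time  # 7 -> logged as seconds behind
lag_inc = newest_write.inc - checkpoint.inc     # only consulted when lag_secs <= 0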
Example #24
 def upsert_each(dm):
     num_failed = 0
     for namespace in dump_set:
         from_coll = self.get_collection(namespace)
         mapped_ns = self.namespace_config.map_namespace(namespace)
         total_docs = retry_until_ok(from_coll.count)
         num = None
         for num, doc in enumerate(docs_to_dump(from_coll)):
             try:
                 dm.upsert(doc, mapped_ns, long_ts)
             except Exception:
                 if self.continue_on_error:
                     LOG.exception(
                         "Could not upsert document: %r" % doc)
                     num_failed += 1
                 else:
                     raise
             if num % 10000 == 0:
                 LOG.info("Upserted %d out of approximately %d docs "
                          "from collection '%s'",
                          num + 1, total_docs, namespace)
         if num is not None:
             LOG.info("Upserted %d out of approximately %d docs from "
                      "collection '%s'",
                      num + 1, total_docs, namespace)
     if num_failed > 0:
         LOG.error("Failed to upsert %d docs" % num_failed)
    def retrieve_doc(self, entry):
        """Given the doc ID's, retrieve those documents from the mongos.
        """

        if not entry:
            return None

        namespace = entry['ns']

        # Update operations don't have an 'o' field specifying the document;
        # instead, 'o' specifies the changes. So we use 'o2' for updates to
        # get the doc_id later.

        if 'o2' in entry:
            doc_field = 'o2'
        else:
            doc_field = 'o'

        doc_id = entry[doc_field]['_id']
        db_name, coll_name = namespace.split('.', 1)

        coll = self.main_connection[db_name][coll_name]
        doc = util.retry_until_ok(coll.find_one, {'_id': doc_id})

        return doc
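For illustration, the shape of oplog entries this method expects (values invented): inserts carry the full document in 'o', while updates identify the target document in 'o2' and describe only the modification in 'o':

# Hypothetical oplog entries, consistent with the comment in retrieve_doc:
insert_entry = {
    "op": "i",
    "ns": "test.coll",
    "o": {"_id": 1, "name": "paul"},     # full document
}
update_entry = {
    "op": "u",
    "ns": "test.coll",
    "o2": {"_id": 1},                    # identifies the updated document
    "o": {"$set": {"name": "pauline"}},  # describes the change only
}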
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.
        """
        logging.debug("OplogThread: Initializing the oplog cursor.")
        timestamp = self.read_last_checkpoint()

        if timestamp is None and self.collection_dump:
            timestamp = self.dump_collection()
            if timestamp:
                msg = "Dumped collection into target system"
                logging.info('OplogThread: %s %s'
                             % (self.oplog, msg))
        elif timestamp is None:
            # set timestamp to top of oplog
            timestamp = retry_until_ok(self.get_last_oplog_timestamp)

        self.checkpoint = timestamp
        cursor = self.get_oplog_cursor(timestamp)
        if cursor is not None:
            self.update_checkpoint()

        return cursor
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.namespace_set

        #no namespaces specified
        if not self.namespace_set:
            db_list = self.main_connection.database_names()
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = self.main_connection[database].collection_names()
                for coll in coll_list:
                    if coll.startswith("system"):
                        continue
                    namespace = str(database) + "." + str(coll)
                    dump_set.append(namespace)

        timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)

        def docs_to_dump():
            for namespace in dump_set:
                logging.info("dumping collection %s" % namespace)
                database, coll = namespace.split('.', 1)
                target_coll = self.main_connection[database][coll]
                cursor = util.retry_until_ok(target_coll.find)
                for doc in cursor:
                    if not self.running:
                        raise StopIteration
                    doc["ns"] = namespace
                    doc["_ts"] = long_ts
                    yield doc

        try:
            # Bulk upsert if possible
            if self.can_bulk:
                self.doc_manager.bulk_upsert(docs_to_dump())
            else:
                for doc in docs_to_dump():
                    try:
                        self.doc_manager.upsert(self.filter_fields(doc))
                    except errors.OperationFailed:
                        logging.error("Unable to insert %s" % doc)
        except (pymongo.errors.AutoReconnect,
                pymongo.errors.OperationFailure):
            err_msg = "OplogManager: Failed during dump collection"
            effect = "cannot recover!"
            logging.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None

        return timestamp
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and the number of documents left in the cursor.
        """
        timestamp = self.read_last_checkpoint()

        if timestamp is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                if timestamp is None:
                    return None, 0
            else:
                # Collection dump disabled:
                # return cursor to beginning of oplog.
                self.checkpoint = self.get_last_oplog_timestamp()
                self.update_checkpoint()
                return self.get_oplog_cursor()

        self.checkpoint = timestamp
        self.update_checkpoint()

        cursor, cursor_len = self.get_oplog_cursor(timestamp)

        if cursor_len == 0:
            # rollback, update checkpoint, and retry
            logging.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor")
            self.checkpoint = self.rollback()
            self.update_checkpoint()
            return self.init_cursor()

        # first entry should be last oplog entry processed
        first_oplog_entry = retry_until_ok(lambda: cursor[0])
        cursor_ts_long = util.bson_ts_to_long(first_oplog_entry.get("ts"))
        given_ts_long = util.bson_ts_to_long(timestamp)
        if cursor_ts_long > given_ts_long:
            # first entry in oplog is beyond timestamp, we've fallen behind!
            return None, 0

        retry_until_ok(next, cursor)
        return cursor, cursor_len - 1
    def test_rollback(self):
      """Test behavior during a MongoDB rollback.
      We force a rollback by adding a doc, killing the primary,
      adding another doc, killing the new primary, and then
      restarting both.
      """
      primary_conn = self.repl_set.primary.client()

      self.conn['test']['rollback'].insert({'name': 'paul'})
      result_set = self.conn['test']['rollback'].find_one()
      self.connector.doc_managers[0].upsert({'_id': str(result_set['_id']),'name': 'paul'}, "test.rollback", 1)
      condition1 = lambda: self.conn['test']['rollback'].find(
          {'name': 'paul'}).count() == 1
      assert_soon(condition1)

      self.repl_set.primary.stop(destroy=False)

      new_primary_conn = self.repl_set.secondary.client()

      admin = new_primary_conn['admin']
      assert_soon(lambda: admin.command("isMaster")['ismaster'])
      time.sleep(5)
      retry_until_ok(self.conn.test.rollback.insert, {'name': 'pauline'})
      assert_soon(lambda: self._count() > 0)
      result_set_2 = self.conn['test']['rollback'].find_one({'name': 'pauline'})
      self.connector.doc_managers[0].upsert({'_id': str(result_set_2['_id']),'name': 'pauline'}, "test.rollback", 1)
      result_set_1 = self.neo4j_conn.find_one("rollback", "name", "pauline")
      self.assertNotEqual(result_set_1, None)
      #make sure pauline is there
      if result_set_1['name'] == 'pauline':
        self.assertEqual(result_set_1['_id'], str(result_set_2['_id']))
      self.repl_set.secondary.stop(destroy=False)

      self.repl_set.primary.start()
      while primary_conn['admin'].command("isMaster")['ismaster'] is False:
        time.sleep(1)

      self.repl_set.secondary.start()

      time.sleep(2)
      node = self.neo4j_conn.find_one("rollback")
      self.assertEqual(node['name'], 'paul')
      node = self.neo4j_conn.find_one("rollback", "name", "pauline")
      self.assertNotEqual(node, None)
      find_cursor = retry_until_ok(self.conn['test']['rollback'].find)
      self.assertEqual(retry_until_ok(find_cursor.count), 1)
    def test_rollback(self):
        """Tests rollback. We force a rollback by adding a doc, killing the
        primary, adding another doc, killing the new primary, and then
        restarting both.
        """
        primary_conn = self.repl_set.primary.client()
        self.conn["test"]["test"].insert_one({"name": "paul"})
        condition = (
            lambda: self.conn["test"]["test"].find_one({"name": "paul"}) is not None
        )
        assert_soon(condition)
        assert_soon(lambda: sum(1 for _ in self._search()) == 1)

        self.repl_set.primary.stop(destroy=False)
        new_primary_conn = self.repl_set.secondary.client()
        admin = new_primary_conn["admin"]

        def condition():
            return admin.command("isMaster")["ismaster"]

        assert_soon(lambda: retry_until_ok(condition))

        retry_until_ok(self.conn.test.test.insert_one, {"name": "pauline"})
        assert_soon(lambda: sum(1 for _ in self._search()) == 2)
        result_set_1 = list(self._search())
        result_set_2 = self.conn["test"]["test"].find_one({"name": "pauline"})
        self.assertEqual(len(result_set_1), 2)
        # make sure pauline is there
        for item in result_set_1:
            if item["name"] == "pauline":
                self.assertEqual(item["_id"], result_set_2["_id"])
        self.repl_set.secondary.stop(destroy=False)

        self.repl_set.primary.start()
        assert_soon(lambda: primary_conn["admin"].command("isMaster")["ismaster"])

        self.repl_set.secondary.start()

        time.sleep(2)
        result_set_1 = list(self._search())
        self.assertEqual(len(result_set_1), 1)
        for item in result_set_1:
            self.assertEqual(item["name"], "paul")
        find_cursor = retry_until_ok(self.conn["test"]["test"].find)
        self.assertEqual(retry_until_ok(find_cursor.count), 1)
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        timestamp = retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)
        # Flag if this oplog thread was cancelled during the collection dump.
        # Use a list to workaround python scoping.
        dump_cancelled = [False]

        def get_all_ns():
            ns_set = []
            gridfs_ns_set = []
            db_list = self.namespace_config.get_included_databases()
            if not db_list:
                # Only use listDatabases when the configured databases are not
                # explicit.
                db_list = retry_until_ok(self.primary_client.database_names)
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = retry_until_ok(
                    self.primary_client[database].collection_names)
                for coll in coll_list:
                    # ignore system collections
                    if coll.startswith("system."):
                        continue
                    # ignore gridfs chunks collections
                    if coll.endswith(".chunks"):
                        continue
                    if coll.endswith(".files"):
                        namespace = "%s.%s" % (database, coll)
                        namespace = namespace[:-len(".files")]
                        if self.namespace_config.gridfs_namespace(namespace):
                            gridfs_ns_set.append(namespace)
                    else:
                        namespace = "%s.%s" % (database, coll)
                        if self.namespace_config.map_namespace(namespace):
                            ns_set.append(namespace)
            return ns_set, gridfs_ns_set

        dump_set, gridfs_dump_set = get_all_ns()

        LOG.debug("OplogThread: Dumping set of collections %s " % dump_set)

        def docs_to_dump(from_coll):
            last_id = None
            attempts = 0
            projection = self.namespace_config.projection(from_coll.full_name)
            # Loop to handle possible AutoReconnect
            while attempts < 60:
                if last_id is None:
                    cursor = retry_until_ok(
                        from_coll.find,
                        projection=projection,
                        sort=[("_id", pymongo.ASCENDING)],
                    )
                else:
                    cursor = retry_until_ok(
                        from_coll.find,
                        {"_id": {
                            "$gt": last_id
                        }},
                        projection=projection,
                        sort=[("_id", pymongo.ASCENDING)],
                    )
                try:
                    for doc in cursor:
                        if not self.running:
                            # Thread was joined while performing the
                            # collection dump.
                            dump_cancelled[0] = True
                            raise StopIteration
                        last_id = doc["_id"]
                        yield doc
                    break
                except (pymongo.errors.AutoReconnect,
                        pymongo.errors.OperationFailure):
                    attempts += 1
                    time.sleep(1)

        def upsert_each(dm):
            num_failed = 0
            for namespace in dump_set:
                from_coll = self.get_collection(namespace)
                mapped_ns = self.namespace_config.map_namespace(namespace)
                total_docs = retry_until_ok(from_coll.count)
                num = None
                for num, doc in enumerate(docs_to_dump(from_coll)):
                    try:
                        dm.upsert(doc, mapped_ns, long_ts)
                    except Exception:
                        if self.continue_on_error:
                            LOG.exception("Could not upsert document: %r" %
                                          doc)
                            num_failed += 1
                        else:
                            raise
                    if num % 10000 == 0:
                        LOG.info(
                            "Upserted %d out of approximately %d docs "
                            "from collection '%s'",
                            num + 1,
                            total_docs,
                            namespace,
                        )
                if num is not None:
                    LOG.info(
                        "Upserted %d out of approximately %d docs from "
                        "collection '%s'",
                        num + 1,
                        total_docs,
                        namespace,
                    )
            if num_failed > 0:
                LOG.error("Failed to upsert %d docs" % num_failed)

        def upsert_all(dm):
            try:
                for namespace in dump_set:
                    from_coll = self.get_collection(namespace)
                    total_docs = retry_until_ok(from_coll.count)
                    mapped_ns = self.namespace_config.map_namespace(namespace)
                    LOG.info(
                        "Bulk upserting approximately %d docs from "
                        "collection '%s'",
                        total_docs,
                        namespace,
                    )
                    dm.bulk_upsert(docs_to_dump(from_coll), mapped_ns, long_ts)
            except Exception:
                if self.continue_on_error:
                    LOG.exception("OplogThread: caught exception"
                                  " during bulk upsert, re-upserting"
                                  " documents serially")
                    upsert_each(dm)
                else:
                    raise

        def do_dump(dm, error_queue):
            try:
                LOG.debug("OplogThread: Using bulk upsert function for "
                          "collection dump")
                upsert_all(dm)

                if gridfs_dump_set:
                    LOG.info("OplogThread: dumping GridFS collections: %s",
                             gridfs_dump_set)

                # Dump GridFS files
                for gridfs_ns in gridfs_dump_set:
                    mongo_coll = self.get_collection(gridfs_ns)
                    from_coll = self.get_collection(gridfs_ns + ".files")
                    dest_ns = self.namespace_config.map_namespace(gridfs_ns)
                    for doc in docs_to_dump(from_coll):
                        gridfile = GridFSFile(mongo_coll, doc)
                        dm.insert_file(gridfile, dest_ns, long_ts)
            except Exception:
                # Likely exceptions:
                # pymongo.errors.OperationFailure,
                # mongo_connector.errors.ConnectionFailed
                # mongo_connector.errors.OperationFailed
                error_queue.put(sys.exc_info())

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()

        if len(self.doc_managers) == 1:
            do_dump(self.doc_managers[0], errors)
        else:
            # Slight performance gain breaking dump into separate
            # threads if > 1 replication target
            for dm in self.doc_managers:
                t = threading.Thread(target=do_dump, args=(dm, errors))
                dumping_threads.append(t)
                t.start()
            # cleanup
            for t in dumping_threads:
                t.join()

        # Print caught exceptions
        try:
            while True:
                LOG.critical("Exception during collection dump",
                             exc_info=errors.get_nowait())
                dump_success = False
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogThread: Failed during dump collection"
            effect = "cannot recover!"
            LOG.error("%s %s %s" % (err_msg, effect, self.oplog))
            self.running = False
            return None

        if dump_cancelled[0]:
            LOG.warning("Initial collection dump was interrupted. "
                        "Will re-run the collection dump on next startup.")
            return None

        return timestamp
 def commit(self):
     """This function is used to force a refresh/commit.
     """
     retry_until_ok(self.elastic.indices.refresh, index="")
Beispiel #33
0
 def test_retry_until_ok_runtime_error(self):
     """Test retry_until_ok does not mask RuntimeErrors.
     """
     with self.assertRaises(RuntimeError):
         retry_until_ok(err_func, RuntimeError)
     self.assertEqual(err_func.counter, 1)
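
The two retry_until_ok tests in this listing (see also Beispiel #45 further down) suggest a helper that retries transient pymongo failures once per second but lets every other exception escape. The following is only a minimal sketch inferred from those tests, not the connector's actual implementation; the retry limit and sleep interval are assumptions.

import time

import pymongo.errors


def retry_until_ok(func, *args, **kwargs):
    """Hypothetical sketch: call func until it succeeds.

    Only transient pymongo errors are retried; anything else (such as
    the RuntimeError in the test above) propagates immediately, so the
    wrapped function is called exactly once in that case.
    """
    attempts = 0
    while True:
        try:
            return func(*args, **kwargs)
        except (pymongo.errors.AutoReconnect,
                pymongo.errors.OperationFailure):
            attempts += 1
            if attempts >= 60:
                raise
            time.sleep(1)
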
    def run(self):
        """Start the oplog worker.
        """
        logging.debug("OplogThread: Run thread started")
        while self.running is True:
            logging.debug("OplogThread: Getting cursor")
            cursor = self.init_cursor()
            logging.debug("OplogThread: Got the cursor, go go go!")

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            #The only entry is the last one we processed
            if cursor is None or util.retry_until_ok(cursor.count) == 1:
                logging.debug("OplogThread: Last entry is the one we "
                              "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            last_ts = None
            err = False
            remove_inc = 0
            upsert_inc = 0
            update_inc = 0
            try:
                logging.debug("OplogThread: about to process new oplog "
                              "entries")
                while cursor.alive and self.running:
                    logging.debug("OplogThread: Cursor is still"
                                  " alive and thread is still running.")
                    for n, entry in enumerate(cursor):

                        logging.debug("OplogThread: Iterating through cursor,"
                                      " document number in this cursor is %d" %
                                      n)
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        # Don't replicate entries resulting from chunk moves
                        if entry.get("fromMigrate"):
                            continue

                        # Take fields out of the oplog entry that
                        # shouldn't be replicated. This may nullify
                        # the document if there's nothing to do.
                        if not self.filter_oplog_entry(entry):
                            continue

                        #sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        # use namespace mapping if one exists
                        ns = self.dest_mapping.get(entry['ns'], ns)

                        for docman in self.doc_managers:
                            try:
                                logging.debug(
                                    "OplogThread: Operation for this "
                                    "entry is %s" % str(operation))

                                # Remove
                                if operation == 'd':
                                    entry['_id'] = entry['o']['_id']
                                    docman.remove(entry)
                                    remove_inc += 1
                                # Insert
                                elif operation == 'i':  # Insert
                                    # Retrieve inserted document from
                                    # 'o' field in oplog record
                                    doc = entry.get('o')
                                    # Extract timestamp and namespace
                                    doc['_ts'] = util.bson_ts_to_long(
                                        entry['ts'])
                                    doc['ns'] = ns
                                    docman.upsert(doc)
                                    upsert_inc += 1
                                # Update
                                elif operation == 'u':
                                    doc = {
                                        "_id": entry['o2']['_id'],
                                        "_ts":
                                        util.bson_ts_to_long(entry['ts']),
                                        "ns": ns
                                    }
                                    # 'o' field contains the update spec
                                    docman.update(doc, entry.get('o', {}))
                                    update_inc += 1
                            except errors.OperationFailed:
                                logging.exception(
                                    "Unable to process oplog document %r" %
                                    entry)
                            except errors.ConnectionFailed:
                                logging.exception(
                                    "Connection failed while processing oplog "
                                    "document %r" % entry)

                        if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                            logging.debug(
                                "OplogThread: Documents removed: %d, "
                                "inserted: %d, updated: %d so far" %
                                (remove_inc, upsert_inc, update_inc))

                        logging.debug("OplogThread: Doc is processed.")

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1 and last_ts is not None:
                            self.checkpoint = last_ts
                            self.update_checkpoint()

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        logging.debug("OplogThread: updating checkpoint after"
                                      "processing new oplog entries")
                        self.checkpoint = last_ts
                        self.update_checkpoint()

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                logging.exception("Cursor closed due to an exception. "
                                  "Will attempt to reconnect.")
                err = True

            if err is True and self.auth_key is not None:
                self.primary_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                self.main_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                err = False

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                logging.debug("OplogThread: updating checkpoint after an "
                              "Exception, cursor closing, or join() on this"
                              "thread.")
                self.checkpoint = last_ts
                self.update_checkpoint()

            logging.debug("OplogThread: Sleeping. Documents removed: %d, "
                          "upserted: %d, updated: %d" %
                          (remove_inc, upsert_inc, update_inc))
            time.sleep(2)
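
The per-batch checkpoint logic above depends on Python's modulo behaviour for the default batch_size of -1, as the inline comment hints. A quick self-contained illustration (the batch size of 100 is just an example value):

# With the default batch_size of -1, n % -1 == 0 for every n, so the
# per-batch branch (n % batch_size == 1) never fires and checkpoints
# are only written after the cursor is exhausted.
for n in range(10):
    assert n % -1 == 0

# With a positive batch size, the branch fires once per batch.
batch_size = 100
hits = [n for n in range(1, 301) if n % batch_size == 1]
print(hits)  # [1, 101, 201]
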
Beispiel #35
0
    def test_rollback(self):
        """Test behavior during a MongoDB rollback.

        We force a rollback by adding a doc, killing the primary,
        adding another doc, killing the new primary, and then
        restarting both.
        """
        primary_conn = self.repl_set.primary.client()

        # This doc can be picked up in the collection dump
        self.conn["test"]["test"].insert_one({"name": "paul"})
        condition1 = (lambda: self.conn["test"]["test"].find({
            "name": "paul"
        }).count() == 1)

        def condition2():
            return self._count() == 1

        assert_soon(condition1)
        assert_soon(condition2)

        # This doc is definitely not picked up by collection dump
        self.conn["test"]["test"].insert_one({"name": "pauly"})

        self.repl_set.primary.stop(destroy=False)

        new_primary_conn = self.repl_set.secondary.client()

        admin = new_primary_conn["admin"]
        assert_soon(lambda: admin.command("isMaster")["ismaster"])
        time.sleep(5)
        retry_until_ok(self.conn.test.test.insert_one, {"name": "pauline"})
        assert_soon(lambda: self._count() == 3)
        result_set_1 = list(self._search())
        result_set_2 = self.conn["test"]["test"].find_one({"name": "pauline"})
        self.assertEqual(len(result_set_1), 3)
        # make sure pauline is there
        for item in result_set_1:
            if item["name"] == "pauline":
                self.assertEqual(item["_id"], str(result_set_2["_id"]))
        self.repl_set.secondary.stop(destroy=False)

        self.repl_set.primary.start()
        while primary_conn["admin"].command("isMaster")["ismaster"] is False:
            time.sleep(1)

        self.repl_set.secondary.start()

        time.sleep(2)
        result_set_1 = list(self._search())
        self.assertEqual(len(result_set_1), 2)

        if result_set_1[0]["name"] == "paul":
            self.assertEqual(result_set_1[1]["name"], "pauly")
        elif result_set_1[0]["name"] == "pauly":
            self.assertEqual(result_set_1[1]["name"], "paul")
        else:
            self.assertTrue(0, "Unknown document retrieved")

        find_cursor = retry_until_ok(self.conn["test"]["test"].find)
        self.assertEqual(retry_until_ok(find_cursor.count), 2)
    def run(self):
        """Start the oplog worker.
        """
        ReplicationLagLogger(self, 30).start()
        LOG.debug("OplogThread: Run thread started")
        while self.running is True:
            LOG.debug("OplogThread: Getting cursor")
            cursor, cursor_empty = retry_until_ok(self.init_cursor)
            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            if cursor_empty:
                LOG.debug("OplogThread: Last entry is the one we "
                          "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            last_ts = None
            remove_inc = 0
            upsert_inc = 0
            update_inc = 0
            try:
                LOG.debug("OplogThread: about to process new oplog entries")
                while cursor.alive and self.running:
                    LOG.debug("OplogThread: Cursor is still"
                              " alive and thread is still running.")
                    for n, entry in enumerate(cursor):
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        LOG.debug("OplogThread: Iterating through cursor,"
                                  " document number in this cursor is %d"
                                  % n)

                        skip, is_gridfs_file = self._should_skip_entry(entry)
                        if skip:
                            # update the last_ts on skipped entries to ensure
                            # our checkpoint does not fall off the oplog. This
                            # also prevents reprocessing skipped entries.
                            last_ts = entry['ts']
                            continue

                        # Sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']
                        self.dump_to_file(entry)


            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                LOG.exception(
                    "Cursor closed due to an exception. "
                    "Will attempt to reconnect.")

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                LOG.debug("OplogThread: updating checkpoint after an "
                          "Exception, cursor closing, or join() on this"
                          "thread.")
                self.update_checkpoint(last_ts)

            LOG.debug("OplogThread: Sleeping. Documents removed: %d, "
                      "upserted: %d, updated: %d"
                      % (remove_inc, upsert_inc, update_inc))
            time.sleep(2)
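
This run() variant delegates filtering to self._should_skip_entry, which is not part of the listing. The sketch below is a hypothetical stand-in showing the kind of checks such a method might perform and the (skip, is_gridfs_file) pair the loop above expects; the exact rules and attribute names are assumptions.

def _should_skip_entry_sketch(entry, namespace_set):
    # Ignore entries produced by chunk migrations.
    if entry.get('fromMigrate'):
        return True, False
    ns = entry.get('ns', '')
    # Ignore namespaces outside the configured set (if one is given).
    if namespace_set and ns not in namespace_set:
        return True, False
    # GridFS ".files" namespaces get special handling by the caller.
    return False, ns.endswith('.files')
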
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.namespace_set or []
        logging.debug("OplogThread: Dumping set of collections %s " % dump_set)

        #no namespaces specified
        if not self.namespace_set:
            db_list = retry_until_ok(self.main_connection.database_names)
            for database in db_list:
                # Skip "config", "local", and any other databases the
                # user configured to be skipped.
                if database in self.db_to_skip:
                    continue
                coll_list = retry_until_ok(
                    self.main_connection[database].collection_names)
                for coll in coll_list:
                    if coll.startswith("system"):
                        continue
                    namespace = "%s.%s" % (database, coll)
                    dump_set.append(namespace)

        timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)

        def docs_to_dump():
            for namespace in dump_set:
                logging.info("OplogThread: dumping collection %s" % namespace)
                database, coll = namespace.split('.', 1)
                last_id = None
                attempts = 0
                sleep_sec = 1
                # Loop to handle possible AutoReconnect
                while attempts < 60:
                    target_coll = self.main_connection[database][coll]
                    if not last_id:
                        cursor = util.retry_until_ok(target_coll.find,
                                                     fields=self._fields,
                                                     sort=[("_id",
                                                            pymongo.ASCENDING)
                                                           ])
                    else:
                        cursor = util.retry_until_ok(target_coll.find,
                                                     {"_id": {
                                                         "$gt": last_id
                                                     }},
                                                     fields=self._fields,
                                                     sort=[("_id",
                                                            pymongo.ASCENDING)
                                                           ])
                    try:
                        for doc in cursor:
                            if not self.running:
                                raise StopIteration
                            doc["ns"] = self.dest_mapping.get(
                                namespace, namespace)
                            doc["_ts"] = long_ts
                            last_id = doc["_id"]
                            yield doc
                        break
                    except pymongo.errors.AutoReconnect:
                        attempts += 1
                        if attempts % 10 == 0:
                            # Exponential backoff: double the sleep interval
                            # every 10 failed attempts.
                            sleep_sec <<= 1
                        time.sleep(sleep_sec)

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()
        try:
            for dm in self.doc_managers:
                # Bulk upsert if possible
                if hasattr(dm, "bulk_upsert"):
                    logging.debug("OplogThread: Using bulk upsert function for"
                                  "collection dump")
                    # Slight performance gain breaking dump into separate
                    # threads, only if > 1 replication target
                    if len(self.doc_managers) == 1:
                        dm.bulk_upsert(docs_to_dump())
                    else:

                        def do_dump(error_queue):
                            all_docs = docs_to_dump()
                            try:
                                dm.bulk_upsert(all_docs)
                            except Exception:
                                # Likely exceptions:
                                # pymongo.errors.OperationFailure,
                                # mongo_connector.errors.ConnectionFailed
                                # mongo_connector.errors.OperationFailed
                                error_queue.put(sys.exc_info())

                        t = threading.Thread(target=do_dump, args=(errors, ))
                        dumping_threads.append(t)
                        t.start()
                else:
                    logging.debug("OplogThread: DocManager %s has not"
                                  "bulk_upsert method.  Upserting documents "
                                  "serially for collection dump." % str(dm))
                    num = 0
                    for num, doc in enumerate(docs_to_dump()):
                        if num % 10000 == 0:
                            logging.debug("Upserted %d docs." % num)
                        dm.upsert(doc)
                    logging.debug("Upserted %d docs" % num)

            # cleanup
            for t in dumping_threads:
                t.join()

        except Exception:
            # See "likely exceptions" comment above
            errors.put(sys.exc_info())

        # Print caught exceptions
        try:
            while True:
                klass, value, trace = errors.get_nowait()
                dump_success = False
                traceback.print_exception(klass, value, trace)
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogThread: Failed during dump collection"
            effect = "cannot recover!"
            logging.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None

        return timestamp
Beispiel #38
0
    def rollback(self):
        """Rollback target system to consistent state.

        The strategy is to find the latest timestamp in the target system and
        the largest timestamp in the oplog less than the latest target system
        timestamp. This defines the rollback window and we just roll these
        back until the oplog and target system are in consistent states.
        """
        # Find the most recently inserted document in each target system
        last_docs = []
        for dm in self.doc_managers:
            dm.commit()
            last_docs.append(dm.get_last_doc())

        # Of these documents, which is the most recent?
        last_inserted_doc = max(last_docs,
                                key=lambda x: x["_ts"] if x else float("-inf"))

        # Nothing has been replicated. No need to rollback target systems
        if last_inserted_doc is None:
            return None

        # Find the oplog entry that touched the most recent document.
        # We'll use this to figure where to pick up the oplog later.
        target_ts = util.long_to_bson_ts(last_inserted_doc['_ts'])
        last_oplog_entry = self.oplog.find_one({'ts': {
            '$lte': target_ts
        }},
                                               sort=[('$natural',
                                                      pymongo.DESCENDING)])

        # The oplog entry for the most recent document doesn't exist anymore.
        # If we've fallen behind in the oplog, this will be caught later
        if last_oplog_entry is None:
            return None

        # rollback_cutoff_ts happened *before* the rollback
        rollback_cutoff_ts = last_oplog_entry['ts']
        start_ts = util.bson_ts_to_long(rollback_cutoff_ts)
        # timestamp of the most recent document on any target system
        end_ts = last_inserted_doc['_ts']

        for dm in self.doc_managers:
            rollback_set = {}  # this is a dictionary of ns:list of docs

            # group potentially conflicted documents by namespace
            for doc in dm.search(start_ts, end_ts):
                if doc['ns'] in rollback_set:
                    rollback_set[doc['ns']].append(doc)
                else:
                    rollback_set[doc['ns']] = [doc]

            # retrieve these documents from MongoDB, either updating
            # or removing them in each target system
            for namespace, doc_list in rollback_set.items():
                # Get the original namespace
                original_namespace = namespace
                for source_name, dest_name in self.dest_mapping.items():
                    if dest_name == namespace:
                        original_namespace = source_name

                database, coll = original_namespace.split('.', 1)
                obj_id = bson.objectid.ObjectId
                bson_obj_id_list = [obj_id(doc['_id']) for doc in doc_list]

                to_update = util.retry_until_ok(
                    self.main_connection[database][coll].find,
                    {'_id': {
                        '$in': bson_obj_id_list
                    }})
                #doc list are docs in target system, to_update are
                #docs in mongo
                doc_hash = {}  # hash by _id
                for doc in doc_list:
                    doc_hash[bson.objectid.ObjectId(doc['_id'])] = doc

                to_index = []

                def collect_existing_docs():
                    for doc in to_update:
                        if doc['_id'] in doc_hash:
                            del doc_hash[doc['_id']]
                            to_index.append(doc)

                retry_until_ok(collect_existing_docs)

                #delete the inconsistent documents
                for doc in doc_hash.values():
                    dm.remove(doc)

                #insert the ones from mongo
                for doc in to_index:
                    doc['_ts'] = util.bson_ts_to_long(rollback_cutoff_ts)
                    doc['ns'] = self.dest_mapping.get(namespace, namespace)
                    try:
                        dm.upsert(self.filter_fields(doc))
                    except errors.OperationFailed:
                        logging.error("Unable to insert %s" % (doc))

        return rollback_cutoff_ts
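
rollback() converts between BSON Timestamps and the packed integer '_ts' values stored in the target systems (util.long_to_bson_ts and util.bson_ts_to_long). A minimal sketch of such a conversion pair, under the assumption that the high 32 bits carry the timestamp's time field and the low 32 bits its inc counter:

from bson.timestamp import Timestamp


def bson_ts_to_long(ts):
    # Pack the 32-bit time and 32-bit inc fields into one integer so
    # timestamps can be stored and compared as plain numbers.
    return (ts.time << 32) + ts.inc


def long_to_bson_ts(val):
    # Inverse of bson_ts_to_long.
    return Timestamp(val >> 32, val & 0xFFFFFFFF)
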
    def test_single_target(self):
        """Test with a single replication target"""

        self.opman.start()

        # Insert first document with primary up
        self.main_conn["test"]["mc"].insert_one({"i": 0})
        self.assertEqual(self.primary_conn["test"]["mc"].find().count(), 1)

        # Make sure the insert is replicated
        secondary = self.secondary_conn
        assert_soon(
            lambda: secondary["test"]["mc"].count() == 1,
            "first write didn't replicate to secondary",
        )

        # Kill the primary
        self.repl_set.primary.stop(destroy=False)

        # Wait for the secondary to be promoted
        assert_soon(lambda: secondary["admin"].command("isMaster")["ismaster"])

        # Insert another document. This will be rolled back later
        retry_until_ok(self.main_conn["test"]["mc"].insert_one, {"i": 1})
        self.assertEqual(secondary["test"]["mc"].count(), 2)

        # Wait for replication to doc manager
        assert_soon(
            lambda: len(self.opman.doc_managers[0]._search()) == 2,
            "not all writes were replicated to doc manager",
        )

        # Kill the new primary
        self.repl_set.secondary.stop(destroy=False)

        # Start both servers back up
        self.repl_set.primary.start()
        primary_admin = self.primary_conn["admin"]
        assert_soon(
            lambda: primary_admin.command("isMaster")["ismaster"],
            "restarted primary never resumed primary status",
        )
        self.repl_set.secondary.start()
        assert_soon(
            lambda: retry_until_ok(secondary.admin.command, "replSetGetStatus")
            ["myState"] == 2,
            "restarted secondary never resumed secondary status",
        )
        assert_soon(
            lambda: retry_until_ok(self.main_conn.test.mc.find().count) > 0,
            "documents not found after primary/secondary restarted",
        )

        # Only first document should exist in MongoDB
        self.assertEqual(self.main_conn["test"]["mc"].count(), 1)
        self.assertEqual(self.main_conn["test"]["mc"].find_one()["i"], 0)

        # Same case should hold for the doc manager
        doc_manager = self.opman.doc_managers[0]
        assert_soon(
            lambda: len(doc_manager._search()) == 1,
            "documents never rolled back in doc manager.",
        )
        self.assertEqual(doc_manager._search()[0]["i"], 0)

        # cleanup
        self.opman.join()
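
The integration tests in this listing rely on an assert_soon helper to wait for asynchronous replication. A minimal sketch of such a polling helper, assuming a one-second poll interval and a fixed number of tries (the real test utility may differ):

import time


def assert_soon(condition, message=None, max_tries=60):
    # Poll the zero-argument condition until it returns a truthy value,
    # failing with the given message once the tries are exhausted.
    for _ in range(max_tries):
        if condition():
            return
        time.sleep(1)
    raise AssertionError(message or "condition never became true")
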
Beispiel #40
0
    def run(self):
        """Start the oplog worker.
        """
        while self.running is True:
            cursor = self.init_cursor()

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogManager: Last entry no longer in oplog"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            #The only entry is the last one we processed
            if cursor is None or util.retry_until_ok(cursor.count) == 1:
                time.sleep(1)
                continue

            last_ts = None
            err = False
            try:
                while cursor.alive and self.running:
                    for n, entry in enumerate(cursor):
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        #sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        #check if ns is excluded or not.
                        #also ensure non-empty namespace set.
                        if (ns not in self.namespace_set
                                and self.namespace_set):
                            continue

                        # use namespace mapping if one exists
                        ns = self.dest_mapping.get(entry['ns'], ns)

                        #delete
                        try:
                            if operation == 'd':
                                entry['_id'] = entry['o']['_id']
                                for dm in self.doc_managers:
                                    dm.remove(entry)
                            #insert/update. They are equal because of lack
                            #of support for partial update
                            elif operation == 'i' or operation == 'u':
                                doc = self.retrieve_doc(entry)
                                if doc is not None:
                                    doc['_ts'] = util.bson_ts_to_long(
                                        entry['ts'])
                                    doc['ns'] = ns
                                    for dm in self.doc_managers:
                                        dm.upsert(self.filter_fields(doc))
                        except errors.OperationFailed:
                            logging.error(
                                "Unable to %s doc with id %s" %
                                ("delete" if operation == "d" else "upsert",
                                 str(entry['_id'])))
                        except errors.ConnectionFailed:
                            logging.error(
                                "Network error while trying to %s %s" %
                                ("delete" if operation == "d" else "upsert",
                                 str(entry['_id'])))

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1 and last_ts is not None:
                            self.checkpoint = last_ts
                            self.update_checkpoint()

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        self.checkpoint = last_ts
                        self.update_checkpoint()

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure):
                err = True

            if err is True and self.auth_key is not None:
                self.primary_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                self.main_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                err = False

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                self.checkpoint = last_ts
                self.update_checkpoint()

            time.sleep(2)
Beispiel #41
0
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.namespace_set

        #no namespaces specified
        if not self.namespace_set:
            db_list = self.main_connection.database_names()
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = self.main_connection[database].collection_names()
                for coll in coll_list:
                    if coll.startswith("system"):
                        continue
                    namespace = str(database) + "." + str(coll)
                    dump_set.append(namespace)

        timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)

        def docs_to_dump():
            for namespace in dump_set:
                logging.info("dumping collection %s" % namespace)
                database, coll = namespace.split('.', 1)
                last_id = None
                attempts = 0

                # Loop to handle possible AutoReconnect
                while attempts < 60:
                    target_coll = self.main_connection[database][coll]
                    if not last_id:
                        cursor = util.retry_until_ok(target_coll.find,
                                                     sort=[("_id",
                                                            pymongo.ASCENDING)
                                                           ])
                    else:
                        cursor = util.retry_until_ok(target_coll.find,
                                                     {"_id": {
                                                         "$gt": last_id
                                                     }},
                                                     sort=[("_id",
                                                            pymongo.ASCENDING)
                                                           ])
                    try:
                        for doc in cursor:
                            if not self.running:
                                raise StopIteration
                            doc["ns"] = self.dest_mapping.get(
                                namespace, namespace)
                            doc["_ts"] = long_ts
                            last_id = doc["_id"]
                            yield doc
                        break
                    except pymongo.errors.AutoReconnect:
                        attempts += 1
                        time.sleep(1)

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()
        try:
            for dm in self.doc_managers:
                # Bulk upsert if possible
                if hasattr(dm, "bulk_upsert"):
                    # Slight performance gain breaking dump into separate
                    # threads, only if > 1 replication target
                    if len(self.doc_managers) == 1:
                        dm.bulk_upsert(docs_to_dump())
                    else:

                        def do_dump(error_queue):
                            all_docs = docs_to_dump()
                            try:
                                dm.bulk_upsert(all_docs)
                            except Exception:
                                # Likely exceptions:
                                # pymongo.errors.OperationFailure,
                                # mongo_connector.errors.ConnectionFailed
                                # mongo_connector.errors.OperationFailed
                                error_queue.put(sys.exc_info())

                        t = threading.Thread(target=do_dump, args=(errors, ))
                        dumping_threads.append(t)
                        t.start()
                else:
                    for doc in docs_to_dump():
                        dm.upsert(self.filter_fields(doc))

            # cleanup
            for t in dumping_threads:
                t.join()

        except Exception:
            # See "likely exceptions" comment above
            errors.put(sys.exc_info())

        # Print caught exceptions
        try:
            while True:
                klass, value, trace = errors.get_nowait()
                dump_success = False
                traceback.print_exception(klass, value, trace)
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogManager: Failed during dump collection"
            effect = "cannot recover!"
            logging.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None

        return timestamp
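
The dest_mapping lookups in dump_collection() and run() rename source namespaces to target namespaces and fall back to the original name when no mapping exists. A tiny hypothetical illustration (the "alpha.foo" to "beta.foo" mapping is made up):

dest_mapping = {"alpha.foo": "beta.foo"}

ns = "alpha.foo"
print(dest_mapping.get(ns, ns))                # beta.foo (renamed)
print(dest_mapping.get("test.mc", "test.mc"))  # test.mc (unchanged)
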
Beispiel #42
0
    def rollback(self):
        """Rollback target system to consistent state.

        The strategy is to find the latest timestamp in the target system and
        the largest timestamp in the oplog less than the latest target system
        timestamp. This defines the rollback window and we just roll these
        back until the oplog and target system are in consistent states.
        """
        # Find the most recently inserted document in each target system
        LOG.debug("OplogThread: Initiating rollback sequence to bring "
                  "system into a consistent state.")
        last_docs = []
        for dm in self.doc_managers:
            dm.commit()
            last_docs.append(dm.get_last_doc())

        # Of these documents, which is the most recent?
        last_inserted_doc = max(last_docs,
                                key=lambda x: x["_ts"] if x else float("-inf"))

        # Nothing has been replicated. No need to rollback target systems
        if last_inserted_doc is None:
            return None

        # Find the oplog entry that touched the most recent document.
        # We'll use this to figure where to pick up the oplog later.
        target_ts = util.long_to_bson_ts(last_inserted_doc['_ts'])
        last_oplog_entry = util.retry_until_ok(self.oplog.find_one,
                                               {'ts': {
                                                   '$lte': target_ts
                                               }},
                                               sort=[('$natural',
                                                      pymongo.DESCENDING)])

        LOG.debug("OplogThread: last oplog entry is %s" %
                  str(last_oplog_entry))

        # The oplog entry for the most recent document doesn't exist anymore.
        # If we've fallen behind in the oplog, this will be caught later
        if last_oplog_entry is None:
            return None

        # rollback_cutoff_ts happened *before* the rollback
        rollback_cutoff_ts = last_oplog_entry['ts']
        start_ts = util.bson_ts_to_long(rollback_cutoff_ts)
        # timestamp of the most recent document on any target system
        end_ts = last_inserted_doc['_ts']

        for dm in self.doc_managers:
            rollback_set = {}  # this is a dictionary of ns:list of docs

            # group potentially conflicted documents by namespace
            for doc in dm.search(start_ts, end_ts):
                if doc['ns'] in rollback_set:
                    rollback_set[doc['ns']].append(doc)
                else:
                    rollback_set[doc['ns']] = [doc]

            # retrieve these documents from MongoDB, either updating
            # or removing them in each target system
            for namespace, doc_list in rollback_set.items():
                # Get the original namespace
                original_namespace = self.dest_mapping_stru.get_key(namespace)
                if not original_namespace:
                    original_namespace = namespace

                database, coll = original_namespace.split('.', 1)
                obj_id = bson.objectid.ObjectId
                bson_obj_id_list = [obj_id(doc['_id']) for doc in doc_list]

                # Use connection to whole cluster if in sharded environment.
                client = self.mongos_client or self.primary_client
                to_update = util.retry_until_ok(
                    client[database][coll].find,
                    {'_id': {
                        '$in': bson_obj_id_list
                    }},
                    projection=self._projection)
                # Doc list are docs in target system, to_update are
                # Docs in mongo
                doc_hash = {}  # Hash by _id
                for doc in doc_list:
                    doc_hash[bson.objectid.ObjectId(doc['_id'])] = doc

                to_index = []

                def collect_existing_docs():
                    for doc in to_update:
                        if doc['_id'] in doc_hash:
                            del doc_hash[doc['_id']]
                            to_index.append(doc)

                retry_until_ok(collect_existing_docs)

                # Delete the inconsistent documents
                LOG.debug("OplogThread: Rollback, removing inconsistent "
                          "docs.")
                remov_inc = 0
                for document_id in doc_hash:
                    try:
                        dm.remove(document_id, namespace,
                                  util.bson_ts_to_long(rollback_cutoff_ts))
                        remov_inc += 1
                        LOG.debug("OplogThread: Rollback, removed document "
                                  "with _id %r" % document_id)
                    except errors.OperationFailed:
                        LOG.warning(
                            "Could not delete document during rollback: %r "
                            "This can happen if this document was already "
                            "removed by another rollback happening at the "
                            "same time." % document_id)

                LOG.debug("OplogThread: Rollback, removed %d docs." %
                          remov_inc)

                # Insert the ones from mongo
                LOG.debug("OplogThread: Rollback, inserting documents "
                          "from mongo.")
                insert_inc = 0
                fail_insert_inc = 0
                for doc in to_index:
                    try:
                        dm.upsert(
                            doc,
                            self.dest_mapping_stru.get(namespace, namespace),
                            util.bson_ts_to_long(rollback_cutoff_ts))
                        insert_inc += 1
                    except errors.OperationFailed:
                        fail_insert_inc += 1
                        LOG.exception("OplogThread: Rollback, Unable to "
                                      "insert %r" % doc)

        LOG.debug("OplogThread: Rollback, Successfully inserted %d "
                  " documents and failed to insert %d"
                  " documents.  Returning a rollback cutoff time of %s " %
                  (insert_inc, fail_insert_inc, str(rollback_cutoff_ts)))

        return rollback_cutoff_ts
Beispiel #43
0
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and True if the cursor is empty.
        """
        timestamp = self.read_last_checkpoint()

        if timestamp is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                if timestamp is None:
                    return None, True
            else:
                # Collection dump disabled:
                # return cursor to beginning of oplog.
                cursor = self.get_oplog_cursor()
                self.checkpoint = self.get_last_oplog_timestamp()
                self.update_checkpoint()
                return cursor, retry_until_ok(self._cursor_empty, cursor)

        self.checkpoint = timestamp
        self.update_checkpoint()

        for i in range(60):
            cursor = self.get_oplog_cursor(timestamp)
            cursor_empty = retry_until_ok(self._cursor_empty, cursor)

            if cursor_empty:
                # rollback, update checkpoint, and retry
                LOG.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor")
                self.checkpoint = self.rollback()
                self.update_checkpoint()
                return self.init_cursor()

            # try to get the first oplog entry
            try:
                first_oplog_entry = retry_until_ok(next, cursor)
            except StopIteration:
                # It's possible for the cursor to become invalid
                # between the next(cursor) call and now
                time.sleep(1)
                continue

            oldest_ts_long = util.bson_ts_to_long(
                self.get_oldest_oplog_timestamp())
            checkpoint_ts_long = util.bson_ts_to_long(timestamp)
            if checkpoint_ts_long < oldest_ts_long:
                # We've fallen behind, the checkpoint has fallen off the oplog
                return None, True

            cursor_ts_long = util.bson_ts_to_long(first_oplog_entry["ts"])
            if cursor_ts_long > checkpoint_ts_long:
                # The checkpoint is not present in this oplog and the oplog
                # did not rollover. This means that we connected to a new
                # primary which did not replicate the checkpoint and which has
                # new changes in its oplog for us to process.
                # rollback, update checkpoint, and retry
                LOG.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor: new oplog entries found but "
                          "checkpoint is not present")
                self.checkpoint = self.rollback()
                self.update_checkpoint()
                return self.init_cursor()

            # first entry has been consumed
            return cursor, cursor_empty

        else:
            raise errors.MongoConnectorError(
                "Could not initialize oplog cursor.")
Beispiel #44
0
    def test_many_targets(self):
        """Test with several replication targets"""

        # OplogThread has multiple doc managers
        doc_managers = [DocManager(), DocManager(), DocManager()]
        self.opman.doc_managers = doc_managers

        self.opman.start()

        # Insert a document into each namespace
        self.main_conn["test"]["mc"].insert_one({"i": 0})
        self.assertEqual(self.primary_conn["test"]["mc"].count(), 1)

        # Make sure the insert is replicated
        secondary = self.secondary_conn
        assert_soon(lambda: secondary["test"]["mc"].count() == 1,
                    "first write didn't replicate to secondary")

        # Kill the primary
        self.repl_set.primary.stop(destroy=False)

        # Wait for the secondary to be promoted
        assert_soon(lambda: secondary.admin.command("isMaster")['ismaster'],
                    'secondary was never promoted')

        # Insert more documents. This will be rolled back later
        # Some of these documents will be manually removed from
        # certain doc managers, to emulate the effect of certain
        # target systems being ahead/behind others
        secondary_ids = []
        for i in range(1, 10):
            secondary_ids.append(
                retry_until_ok(self.main_conn["test"]["mc"].insert_one, {
                    "i": i
                }).inserted_id)
        self.assertEqual(self.secondary_conn["test"]["mc"].count(), 10)

        # Wait for replication to the doc managers
        def docmans_done():
            for dm in self.opman.doc_managers:
                if len(dm._search()) != 10:
                    return False
            return True

        assert_soon(docmans_done,
                    "not all writes were replicated to doc managers")

        # Remove some documents from the doc managers to simulate
        # uneven replication
        ts = self.opman.doc_managers[0].get_last_doc()['_ts']
        for id in secondary_ids[8:]:
            self.opman.doc_managers[1].remove(id, 'test.mc', ts)
        for id in secondary_ids[2:]:
            self.opman.doc_managers[2].remove(id, 'test.mc', ts)

        # Kill the new primary
        self.repl_set.secondary.stop(destroy=False)

        # Start both servers back up
        self.repl_set.primary.start()
        primary_admin = self.primary_conn["admin"]
        assert_soon(lambda: primary_admin.command("isMaster")['ismaster'],
                    'restarted primary never resumed primary status')
        self.repl_set.secondary.start()
        assert_soon(
            lambda: retry_until_ok(secondary.admin.command, 'replSetGetStatus'
                                   )['myState'] == 2,
            "restarted secondary never resumed secondary status")
        assert_soon(
            lambda: retry_until_ok(self.primary_conn.test.mc.find().count) > 0,
            "documents not found after primary/secondary restarted")

        # Only first document should exist in MongoDB
        self.assertEqual(self.primary_conn["test"]["mc"].count(), 1)
        self.assertEqual(self.primary_conn["test"]["mc"].find_one()["i"], 0)

        # Give OplogThread some time to catch up
        time.sleep(10)

        # Same case should hold for the doc managers
        for dm in self.opman.doc_managers:
            self.assertEqual(len(dm._search()), 1)
            self.assertEqual(dm._search()[0]["i"], 0)

        self.opman.join()
Beispiel #45
0
    def test_retry_until_ok(self):
        """Test retry_until_ok
        """

        self.assertTrue(retry_until_ok(err_func))
        self.assertEqual(err_func.counter, 3)
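
Together with the RuntimeError test earlier in this listing, this test pins down err_func's contract: it fails with a retriable error on its first two calls and succeeds on the third. A hypothetical err_func consistent with both assertions (the real fixture presumably resets the counter in setUp):

import pymongo.errors


def err_func(exception_class=pymongo.errors.AutoReconnect):
    # Fail twice with the given exception, then succeed, so
    # retry_until_ok(err_func) returns True after three calls while
    # retry_until_ok(err_func, RuntimeError) raises on the first call.
    err_func.counter += 1
    if err_func.counter < 3:
        raise exception_class("simulated failure")
    return True


err_func.counter = 0
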
    def dump_collection(self):
        """Dumps collection into the target system.

        This method is called when we're initializing the cursor and have no
        configs i.e. when we're starting for the first time.
        """

        dump_set = self.namespace_set or []
        LOG.debug("OplogThread: Dumping set of collections %s " % dump_set)

        #no namespaces specified
        if not self.namespace_set:
            db_list = retry_until_ok(self.primary_client.database_names)
            for database in db_list:
                if database == "config" or database == "local":
                    continue
                coll_list = retry_until_ok(
                    self.primary_client[database].collection_names)
                for coll in coll_list:
                    # ignore system collections
                    if coll.startswith("system."):
                        continue
                    # ignore gridfs collections
                    if coll.endswith(".files") or coll.endswith(".chunks"):
                        continue
                    namespace = "%s.%s" % (database, coll)
                    dump_set.append(namespace)

        timestamp = util.retry_until_ok(self.get_last_oplog_timestamp)
        if timestamp is None:
            return None
        long_ts = util.bson_ts_to_long(timestamp)

        def docs_to_dump(namespace):
            database, coll = namespace.split('.', 1)
            last_id = None
            attempts = 0

            # Loop to handle possible AutoReconnect
            while attempts < 60:
                target_coll = self.primary_client[database][coll]
                if not last_id:
                    cursor = util.retry_until_ok(target_coll.find,
                                                 fields=self._fields,
                                                 sort=[("_id",
                                                        pymongo.ASCENDING)])
                else:
                    cursor = util.retry_until_ok(target_coll.find,
                                                 {"_id": {
                                                     "$gt": last_id
                                                 }},
                                                 fields=self._fields,
                                                 sort=[("_id",
                                                        pymongo.ASCENDING)])
                try:
                    for doc in cursor:
                        if not self.running:
                            raise StopIteration
                        last_id = doc["_id"]
                        yield doc
                    break
                except (pymongo.errors.AutoReconnect,
                        pymongo.errors.OperationFailure):
                    attempts += 1
                    time.sleep(1)

        def upsert_each(dm):
            num_inserted = 0
            num_failed = 0
            for namespace in dump_set:
                for num, doc in enumerate(docs_to_dump(namespace)):
                    if num % 10000 == 0:
                        LOG.debug("Upserted %d docs." % num)
                    try:
                        mapped_ns = self.dest_mapping.get(namespace, namespace)
                        dm.upsert(doc, mapped_ns, long_ts)
                        num_inserted += 1
                    except Exception:
                        if self.continue_on_error:
                            LOG.exception("Could not upsert document: %r" %
                                          doc)
                            num_failed += 1
                        else:
                            raise
            LOG.debug("Upserted %d docs" % num_inserted)
            if num_failed > 0:
                LOG.error("Failed to upsert %d docs" % num_failed)

        def upsert_all(dm):
            try:
                for namespace in dump_set:
                    mapped_ns = self.dest_mapping.get(namespace, namespace)
                    dm.bulk_upsert(docs_to_dump(namespace), mapped_ns, long_ts)
            except Exception:
                if self.continue_on_error:
                    LOG.exception("OplogThread: caught exception"
                                  " during bulk upsert, re-upserting"
                                  " documents serially")
                    upsert_each(dm)
                else:
                    raise

        def do_dump(dm, error_queue):
            try:
                # Dump the documents, bulk upsert if possible
                if hasattr(dm, "bulk_upsert"):
                    LOG.debug("OplogThread: Using bulk upsert function for "
                              "collection dump")
                    upsert_all(dm)
                else:
                    LOG.debug("OplogThread: DocManager %s has no "
                              "bulk_upsert method.  Upserting documents "
                              "serially for collection dump." % str(dm))
                    upsert_each(dm)

                # Dump GridFS files
                for gridfs_ns in self.gridfs_set:
                    db, coll = gridfs_ns.split('.', 1)
                    mongo_coll = self.primary_client[db][coll]
                    dest_ns = self.dest_mapping.get(gridfs_ns, gridfs_ns)
                    for doc in docs_to_dump(gridfs_ns + '.files'):
                        gridfile = GridFSFile(mongo_coll, doc)
                        dm.insert_file(gridfile, dest_ns, long_ts)
            except:
                # Likely exceptions:
                # pymongo.errors.OperationFailure,
                # mongo_connector.errors.ConnectionFailed
                # mongo_connector.errors.OperationFailed
                error_queue.put(sys.exc_info())

        # Extra threads (if any) that assist with collection dumps
        dumping_threads = []
        # Did the dump succeed for all target systems?
        dump_success = True
        # Holds any exceptions we can't recover from
        errors = queue.Queue()

        if len(self.doc_managers) == 1:
            do_dump(self.doc_managers[0], errors)
        else:
            # Slight performance gain breaking dump into separate
            # threads if > 1 replication target
            for dm in self.doc_managers:
                t = threading.Thread(target=do_dump, args=(dm, errors))
                dumping_threads.append(t)
                t.start()
            # cleanup
            for t in dumping_threads:
                t.join()

        # Print caught exceptions
        try:
            while True:
                LOG.critical('Exception during collection dump',
                             exc_info=errors.get_nowait())
                dump_success = False
        except queue.Empty:
            pass

        if not dump_success:
            err_msg = "OplogThread: Failed during dump collection"
            effect = "cannot recover!"
            LOG.error('%s %s %s' % (err_msg, effect, self.oplog))
            self.running = False
            return None

        return timestamp
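
The docs_to_dump helpers above all follow the same resumable-scan pattern: read in _id order, remember the last _id seen, and after an AutoReconnect re-issue the find with {"_id": {"$gt": last_id}}. A minimal stand-alone sketch of that pattern, assuming only a PyMongo collection object (the helper name is illustrative, not part of mongo-connector):

import time

import pymongo


def resumable_scan(coll, max_attempts=60):
    """Yield every document in coll, resuming after transient disconnects."""
    last_id = None
    attempts = 0
    while attempts < max_attempts:
        if last_id is None:
            cursor = coll.find(sort=[("_id", pymongo.ASCENDING)])
        else:
            cursor = coll.find({"_id": {"$gt": last_id}},
                               sort=[("_id", pymongo.ASCENDING)])
        try:
            for doc in cursor:
                last_id = doc["_id"]
                yield doc
            return
        except pymongo.errors.AutoReconnect:
            attempts += 1
            time.sleep(1)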
Beispiel #47
0
    def test_rollback(self):
        """Test rollback in oplog_manager. Assertion failure if it doesn't pass
            We force a rollback by inserting a doc, killing primary, inserting
            another doc, killing the new primary, and then restarting both
            servers.
        """

        os.system('rm %s; touch %s' % (CONFIG, CONFIG))
        if not start_cluster(sharded=True):
            self.fail("Shards cannot be added to mongos")

        test_oplog, primary_conn, solr, mongos = self.get_new_oplog()

        solr = test_oplog.doc_managers[0]
        solr._delete()          # equivalent to solr.delete(q='*:*')

        safe_mongo_op(mongos['alpha']['foo'].remove, {})
        safe_mongo_op(mongos['alpha']['foo'].insert,
                      {'_id': ObjectId('4ff74db3f646462b38000001'),
                      'name': 'paulie'})
        cutoff_ts = test_oplog.get_last_oplog_timestamp()

        obj2 = ObjectId('4ff74db3f646462b38000002')
        first_doc = {'name': 'paulie', '_ts': bson_ts_to_long(cutoff_ts),
                     'ns': 'alpha.foo', 
                     '_id': ObjectId('4ff74db3f646462b38000001')}

        # try kill one, try restarting
        kill_mongo_proc(primary_conn.host, PORTS_ONE['PRIMARY'])

        new_primary_conn = Connection(HOSTNAME, int(PORTS_ONE['SECONDARY']))
        admin_db = new_primary_conn['admin']
        while admin_db.command("isMaster")['ismaster'] is False:
            time.sleep(1)
        time.sleep(5)
        count = 0
        while True:
            try:
                mongos['alpha']['foo'].insert({'_id': obj2, 'name': 'paul'})
                break
            except OperationFailure:
                time.sleep(1)
                count += 1
                if count > 60:
                    self.fail('Insert failed too many times in rollback')
                continue

        kill_mongo_proc(primary_conn.host, PORTS_ONE['SECONDARY'])

        start_mongo_proc(PORTS_ONE['PRIMARY'], "demo-repl", "/replset1a",
                       "/replset1a.log", None)

        # wait for master to be established
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        start_mongo_proc(PORTS_ONE['SECONDARY'], "demo-repl", "/replset1b",
                       "/replset1b.log", None)

        # wait for secondary to be established
        admin_db = new_primary_conn['admin']
        while admin_db.command("replSetGetStatus")['myState'] != 2:
            time.sleep(1)

        while retry_until_ok(mongos['alpha']['foo'].find().count) != 1:
            time.sleep(1)

        self.assertEqual(str(new_primary_conn.port), PORTS_ONE['SECONDARY'])
        self.assertEqual(str(primary_conn.port), PORTS_ONE['PRIMARY'])

        last_ts = test_oplog.get_last_oplog_timestamp()
        second_doc = {'name': 'paul', '_ts': bson_ts_to_long(last_ts),
                      'ns': 'alpha.foo', '_id': obj2}

        solr.upsert(first_doc)
        solr.upsert(second_doc)
        test_oplog.rollback()
        solr.commit()
        results = solr._search()

        self.assertEqual(len(results), 1)

        results_doc = results[0]
        self.assertEqual(results_doc['name'], 'paulie')
        self.assertTrue(results_doc['_ts'] <= bson_ts_to_long(cutoff_ts))
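
The failover steps in the test above poll isMaster until a member reports itself as primary. A hedged sketch of that wait as a helper, assuming a client connected directly to the member being promoted (the helper name is hypothetical):

import time


def wait_for_primary(member_conn, timeout=60):
    """Return True once the member reports ismaster, False on timeout."""
    for _ in range(timeout):
        if member_conn['admin'].command("isMaster").get('ismaster'):
            return True
        time.sleep(1)
    return False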
Beispiel #48
0
 def rollback_done():
     secondary1_count = retry_until_ok(db_secondary1.count)
     secondary2_count = retry_until_ok(db_secondary2.count)
     return (1, 1) == (secondary1_count, secondary2_count)
    def test_rollback(self):
        """Test the rollback method in a sharded environment

        Cases:
        1. Documents on both shards, rollback on one shard
        2. Documents on both shards, rollback on both shards

        """

        self.opman1.start()
        self.opman2.start()

        # Insert first documents while primaries are up
        db_main = self.mongos_conn["test"]["mcsharded"]
        db_main.insert({"i": 0}, w=2)
        db_main.insert({"i": 1000}, w=2)
        self.assertEqual(self.shard1_conn["test"]["mcsharded"].count(), 1)
        self.assertEqual(self.shard2_conn["test"]["mcsharded"].count(), 1)

        # Case 1: only one primary goes down, shard1 in this case
        kill_mongo_proc(self.shard1_prim_p, destroy=False)

        # Wait for the secondary to be promoted
        shard1_secondary_admin = self.shard1_secondary_conn["admin"]
        assert_soon(
            lambda: shard1_secondary_admin.command("isMaster")["ismaster"])

        # Insert another document. This will be rolled back later
        retry_until_ok(db_main.insert, {"i": 1})
        db_secondary1 = self.shard1_secondary_conn["test"]["mcsharded"]
        db_secondary2 = self.shard2_secondary_conn["test"]["mcsharded"]
        self.assertEqual(db_secondary1.count(), 2)

        # Wait for replication on the doc manager
        # Note that both OplogThreads share the same doc manager
        c = lambda: len(self.opman1.doc_managers[0]._search()) == 3
        assert_soon(c,
                    "not all writes were replicated to doc manager",
                    max_tries=120)

        # Kill the new primary
        kill_mongo_proc(self.shard1_scnd_p, destroy=False)

        # Start both servers back up
        restart_mongo_proc(self.shard1_prim_p)
        primary_admin = self.shard1_conn["admin"]
        c = lambda: primary_admin.command("isMaster")["ismaster"]
        assert_soon(lambda: retry_until_ok(c))
        restart_mongo_proc(self.shard1_scnd_p)
        secondary_admin = self.shard1_secondary_conn["admin"]
        c = lambda: secondary_admin.command("replSetGetStatus")["myState"] == 2
        assert_soon(c)
        query = {"i": {"$lt": 1000}}
        assert_soon(lambda: retry_until_ok(db_main.find(query).count) > 0)

        # Only first document should exist in MongoDB
        self.assertEqual(db_main.find(query).count(), 1)
        self.assertEqual(db_main.find_one(query)["i"], 0)

        # Same should hold for the doc manager
        docman_docs = [
            d for d in self.opman1.doc_managers[0]._search() if d["i"] < 1000
        ]
        self.assertEqual(len(docman_docs), 1)
        self.assertEqual(docman_docs[0]["i"], 0)

        # Wait for previous rollback to complete
        def rollback_done():
            secondary1_count = retry_until_ok(db_secondary1.count)
            secondary2_count = retry_until_ok(db_secondary2.count)
            return (1, 1) == (secondary1_count, secondary2_count)

        assert_soon(rollback_done,
                    "rollback never replicated to one or more secondaries")

        ##############################

        # Case 2: Primaries on both shards go down
        kill_mongo_proc(self.shard1_prim_p, destroy=False)
        kill_mongo_proc(self.shard2_prim_p, destroy=False)

        # Wait for the secondaries to be promoted
        shard1_secondary_admin = self.shard1_secondary_conn["admin"]
        shard2_secondary_admin = self.shard2_secondary_conn["admin"]
        assert_soon(
            lambda: shard1_secondary_admin.command("isMaster")["ismaster"])
        assert_soon(
            lambda: shard2_secondary_admin.command("isMaster")["ismaster"])

        # Insert another document on each shard. These will be rolled back later
        retry_until_ok(db_main.insert, {"i": 1})
        self.assertEqual(db_secondary1.count(), 2)
        retry_until_ok(db_main.insert, {"i": 1001})
        self.assertEqual(db_secondary2.count(), 2)

        # Wait for replication on the doc manager
        c = lambda: len(self.opman1.doc_managers[0]._search()) == 4
        assert_soon(c, "not all writes were replicated to doc manager")

        # Kill the new primaries
        kill_mongo_proc(self.shard1_scnd_p, destroy=False)
        kill_mongo_proc(self.shard2_scnd_p, destroy=False)

        # Start the servers back up...
        # Shard 1
        restart_mongo_proc(self.shard1_prim_p)
        c = lambda: self.shard1_conn['admin'].command("isMaster")["ismaster"]
        assert_soon(lambda: retry_until_ok(c))
        restart_mongo_proc(self.shard1_scnd_p)
        secondary_admin = self.shard1_secondary_conn["admin"]
        c = lambda: secondary_admin.command("replSetGetStatus")["myState"] == 2
        assert_soon(c)
        # Shard 2
        restart_mongo_proc(self.shard2_prim_p)
        c = lambda: self.shard2_conn['admin'].command("isMaster")["ismaster"]
        assert_soon(lambda: retry_until_ok(c))
        restart_mongo_proc(self.shard2_scnd_p)
        secondary_admin = self.shard2_secondary_conn["admin"]
        c = lambda: secondary_admin.command("replSetGetStatus")["myState"] == 2
        assert_soon(c)

        # Wait for the shards to come online
        assert_soon(lambda: retry_until_ok(db_main.find(query).count) > 0)
        query2 = {"i": {"$gte": 1000}}
        assert_soon(lambda: retry_until_ok(db_main.find(query2).count) > 0)

        # Only first documents should exist in MongoDB
        self.assertEqual(db_main.find(query).count(), 1)
        self.assertEqual(db_main.find_one(query)["i"], 0)
        self.assertEqual(db_main.find(query2).count(), 1)
        self.assertEqual(db_main.find_one(query2)["i"], 1000)

        # Same should hold for the doc manager
        i_values = [d["i"] for d in self.opman1.doc_managers[0]._search()]
        self.assertEqual(len(i_values), 2)
        self.assertIn(0, i_values)
        self.assertIn(1000, i_values)
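
assert_soon is used above to poll conditions such as rollback_done until they hold. A minimal sketch of what such a helper might look like (the real test utility may differ in its retry budget and error handling):

import time


def assert_soon(condition, message=None, max_tries=60, sleep=1):
    """Poll condition() until it is truthy or the retry budget is exhausted."""
    for _ in range(max_tries):
        try:
            if condition():
                return
        except Exception:
            pass
        time.sleep(sleep)
    raise AssertionError(message or "condition never became true")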
Beispiel #50
0
    def test_rollback(self):
        """Test the rollback method in a sharded environment

        Cases:
        1. Documents on both shards, rollback on one shard
        2. Documents on both shards, rollback on both shards

        """

        self.opman1.start()
        self.opman2.start()

        # Insert first documents while primaries are up
        db_main = self.mongos_conn["test"]["mcsharded"]
        db_main2 = db_main.with_options(write_concern=WriteConcern(w=2))
        db_main2.insert_one({"i": 0})
        db_main2.insert_one({"i": 1000})
        self.assertEqual(self.shard1_conn["test"]["mcsharded"].count(), 1)
        self.assertEqual(self.shard2_conn["test"]["mcsharded"].count(), 1)

        # Case 1: only one primary goes down, shard1 in this case
        self.cluster.shards[0].primary.stop(destroy=False)

        # Wait for the secondary to be promoted
        shard1_secondary_admin = self.shard1_secondary_conn["admin"]
        assert_soon(
            lambda: shard1_secondary_admin.command("isMaster")["ismaster"])

        # Insert another document. This will be rolled back later
        def insert_one(doc):
            if not db_main.find_one(doc):
                return db_main.insert_one(doc)
            return True
        assert_soon(lambda: retry_until_ok(insert_one, {"i": 1}),
                    "could not insert into shard1 with one node down")
        db_secondary1 = self.shard1_secondary_conn["test"]["mcsharded"]
        db_secondary2 = self.shard2_secondary_conn["test"]["mcsharded"]
        self.assertEqual(db_secondary1.count(), 2)

        # Wait for replication on the doc manager
        # Note that both OplogThreads share the same doc manager
        c = lambda: len(self.opman1.doc_managers[0]._search()) == 3
        assert_soon(c, "not all writes were replicated to doc manager",
                    max_tries=120)

        # Kill the new primary
        self.cluster.shards[0].secondary.stop(destroy=False)

        # Start both servers back up
        self.cluster.shards[0].primary.start()
        primary_admin = self.shard1_conn["admin"]
        c = lambda: primary_admin.command("isMaster")["ismaster"]
        assert_soon(lambda: retry_until_ok(c))
        self.cluster.shards[0].secondary.start()
        secondary_admin = self.shard1_secondary_conn["admin"]
        c = lambda: secondary_admin.command("replSetGetStatus")["myState"] == 2
        assert_soon(c)
        query = {"i": {"$lt": 1000}}
        assert_soon(lambda: retry_until_ok(db_main.find(query).count) > 0)

        # Only first document should exist in MongoDB
        self.assertEqual(db_main.find(query).count(), 1)
        self.assertEqual(db_main.find_one(query)["i"], 0)

        def check_docman_rollback():
            docman_docs = [d for d in self.opman1.doc_managers[0]._search()
                           if d["i"] < 1000]
            return len(docman_docs) == 1 and docman_docs[0]["i"] == 0
        assert_soon(check_docman_rollback,
                    "doc manager did not roll back")

        # Wait for previous rollback to complete.
        # Insert/delete one document to jump-start replication to secondaries
        # in MongoDB 3.x.
        db_main.insert_one({'i': -1})
        db_main.delete_one({'i': -1})

        def rollback_done():
            secondary1_count = retry_until_ok(db_secondary1.count)
            secondary2_count = retry_until_ok(db_secondary2.count)
            return (1, 1) == (secondary1_count, secondary2_count)
        assert_soon(rollback_done,
                    "rollback never replicated to one or more secondaries")

        ##############################

        # Case 2: Primaries on both shards go down
        self.cluster.shards[0].primary.stop(destroy=False)
        self.cluster.shards[1].primary.stop(destroy=False)

        # Wait for the secondaries to be promoted
        shard1_secondary_admin = self.shard1_secondary_conn["admin"]
        shard2_secondary_admin = self.shard2_secondary_conn["admin"]
        assert_soon(
            lambda: shard1_secondary_admin.command("isMaster")["ismaster"])
        assert_soon(
            lambda: shard2_secondary_admin.command("isMaster")["ismaster"])

        # Insert another document on each shard which will be rolled back later
        assert_soon(lambda: retry_until_ok(insert_one, {"i": 1}),
                    "could not insert into shard1 with one node down")
        self.assertEqual(db_secondary1.count(), 2)
        assert_soon(lambda: retry_until_ok(insert_one, {"i": 1001}),
                    "could not insert into shard2 with one node down")
        self.assertEqual(db_secondary2.count(), 2)

        # Wait for replication on the doc manager
        c = lambda: len(self.opman1.doc_managers[0]._search()) == 4
        assert_soon(c, "not all writes were replicated to doc manager")

        # Kill the new primaries
        self.cluster.shards[0].secondary.stop(destroy=False)
        self.cluster.shards[1].secondary.stop(destroy=False)

        # Start the servers back up...
        # Shard 1
        self.cluster.shards[0].primary.start()
        c = lambda: self.shard1_conn['admin'].command("isMaster")["ismaster"]
        assert_soon(lambda: retry_until_ok(c))
        self.cluster.shards[0].secondary.start()
        secondary_admin = self.shard1_secondary_conn["admin"]
        c = lambda: secondary_admin.command("replSetGetStatus")["myState"] == 2
        assert_soon(c)
        # Shard 2
        self.cluster.shards[1].primary.start()
        c = lambda: self.shard2_conn['admin'].command("isMaster")["ismaster"]
        assert_soon(lambda: retry_until_ok(c))
        self.cluster.shards[1].secondary.start()
        secondary_admin = self.shard2_secondary_conn["admin"]
        c = lambda: secondary_admin.command("replSetGetStatus")["myState"] == 2
        assert_soon(c)

        # Wait for the shards to come online
        assert_soon(lambda: retry_until_ok(db_main.find(query).count) > 0)
        query2 = {"i": {"$gte": 1000}}
        assert_soon(lambda: retry_until_ok(db_main.find(query2).count) > 0)

        # Only first documents should exist in MongoDB
        self.assertEqual(db_main.find(query).count(), 1)
        self.assertEqual(db_main.find_one(query)["i"], 0)
        self.assertEqual(db_main.find(query2).count(), 1)
        self.assertEqual(db_main.find_one(query2)["i"], 1000)

        # Same should hold for the doc manager
        assert_soon(lambda: len(self.opman1.doc_managers[0]._search()) == 2)
        i_values = [d["i"] for d in self.opman1.doc_managers[0]._search()]
        self.assertIn(0, i_values)
        self.assertIn(1000, i_values)
    def get_oplog_cursor(self, timestamp):
        """Move cursor to the proper place in the oplog.
        """

        logging.debug("OplogThread: Getting the oplog cursor and moving it "
                      "to the proper place in the oplog.")

        if timestamp is None:
            return None

        cursor, cursor_len = None, 0
        while True:
            try:
                logging.debug("OplogThread: Getting the oplog cursor "
                              "in the while true loop for get_oplog_cursor")
                if self.namespace_set:
                    # If both namespace_set and namespace_set_to_skip are
                    # configured, namespace_set takes precedence.
                    query = {'ts': {'$gte': timestamp},
                             'ns': {'$in': self.namespace_set}}
                elif self.namespace_set_to_skip:
                    # The user wants to skip these namespaces.
                    query = {'ts': {'$gte': timestamp},
                             'ns': {'$nin': self.namespace_set_to_skip}}
                else:
                    query = {'ts': {'$gte': timestamp}}
                cursor = self.oplog.find(query,
                                         tailable=True,
                                         await_data=True)
                # Applying 8 as the mask to the cursor enables OplogReplay
                cursor.add_option(8)
                logging.debug("OplogThread: Cursor created, getting a count.")
                cursor_len = cursor.count()
                logging.debug("OplogThread: Count is %d" % cursor_len)
                break
            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                pass
        if cursor_len == 0:
            logging.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor")
            # Rollback: we are past the last element in the oplog
            timestamp = self.rollback()

            logging.info('Finished rollback')
            return self.get_oplog_cursor(timestamp)
        first_oplog_entry = retry_until_ok(lambda: cursor[0])
        cursor_ts_long = util.bson_ts_to_long(first_oplog_entry.get("ts"))
        given_ts_long = util.bson_ts_to_long(timestamp)
        if cursor_ts_long > given_ts_long:
            # first entry in oplog is beyond timestamp, we've fallen behind!
            return None
        elif cursor_len == 1:  # we are at the end of the oplog
            self.checkpoint = timestamp  # to commit new TS after rollbacks
            return cursor
        elif cursor_len > 1:
            doc = retry_until_ok(next, cursor)
            if timestamp == doc['ts']:
                return cursor
            else:  # error condition
                logging.error('OplogThread: %s Bad timestamp in config file' %
                              self.oplog)
                return None
def start_cluster(sharded=False, key_file=None, use_mongos=True):
    """Sets up cluster with 1 shard, replica set with 3 members
    """
    # Kill all spawned mongods
    kill_all_mongo_proc('localhost', PORTS_ONE)
    kill_all_mongo_proc('localhost', PORTS_TWO)

    # Kill all spawned mongos
    kill_mongos_proc()

    # reset data dirs
    remove_dir(DEMO_SERVER_LOG)
    remove_dir(DEMO_SERVER_DATA)

    create_dir(DEMO_SERVER_DATA + "/standalone/journal")
    create_dir(DEMO_SERVER_DATA + "/replset1a/journal")
    create_dir(DEMO_SERVER_DATA + "/replset1b/journal")
    create_dir(DEMO_SERVER_DATA + "/replset1c/journal")

    if sharded:
        create_dir(DEMO_SERVER_DATA + "/replset2a/journal")
        create_dir(DEMO_SERVER_DATA + "/replset2b/journal")
        create_dir(DEMO_SERVER_DATA + "/replset2c/journal")

    create_dir(DEMO_SERVER_DATA + "/shard1a/journal")
    create_dir(DEMO_SERVER_DATA + "/shard1b/journal")
    create_dir(DEMO_SERVER_DATA + "/config1/journal")
    create_dir(DEMO_SERVER_LOG)

    # Create the replica set
    start_mongo_proc(PORTS_ONE["PRIMARY"], "demo-repl", "/replset1a",
                     "/replset1a.log", key_file)
    start_mongo_proc(PORTS_ONE["SECONDARY"], "demo-repl", "/replset1b",
                     "/replset1b.log", key_file)
    start_mongo_proc(PORTS_ONE["ARBITER"], "demo-repl", "/replset1c",
                     "/replset1c.log", key_file)

    if sharded:
        start_mongo_proc(PORTS_TWO["PRIMARY"], "demo-repl-2", "/replset2a",
                         "/replset2a.log", key_file)
        start_mongo_proc(PORTS_TWO["SECONDARY"], "demo-repl-2", "/replset2b",
                         "/replset2b.log", key_file)
        start_mongo_proc(PORTS_TWO["ARBITER"], "demo-repl-2", "/replset2c",
                         "/replset2c.log", key_file)

    # Setup config server
    cmd = ("mongod --oplogSize 500 --fork --configsvr --noprealloc --port " +
           PORTS_ONE["CONFIG"] + " --dbpath " + DEMO_SERVER_DATA +
           "/config1 --rest --logpath " + DEMO_SERVER_LOG +
           "/config1.log --logappend")

    if key_file is not None:
        cmd += " --keyFile " + key_file

    cmd += " &"
    execute_command(cmd)
    check_started(int(PORTS_ONE["CONFIG"]))

    # Setup the mongos, same mongos for both shards
    cmd = ("mongos --port " + PORTS_ONE["MONGOS"] +
           " --fork --configdb localhost:" + PORTS_ONE["CONFIG"] +
           " --chunkSize 1  --logpath " + DEMO_SERVER_LOG +
           "/mongos1.log --logappend")

    if key_file is not None:
        cmd += " --keyFile " + key_file

    cmd += " &"
    if use_mongos:
        execute_command(cmd)
        check_started(int(PORTS_ONE["MONGOS"]))

    # configuration for replSet 1
    config = {'_id': "demo-repl",
              'members': [{'_id': 0, 'host': "localhost:27117"},
                          {'_id': 1, 'host': "localhost:27118"},
                          {'_id': 2, 'host': "localhost:27119",
                           'arbiterOnly': 'true'}]}

    # configuration for replSet 2, not always used
    config2 = {'_id': "demo-repl-2",
               'members': [{'_id': 0, 'host': "localhost:27317"},
                           {'_id': 1, 'host': "localhost:27318"},
                           {'_id': 2, 'host': "localhost:27319",
                            'arbiterOnly': 'true'}]}

    primary = Connection('localhost:27117')
    if use_mongos:
        mongos = Connection('localhost:27217')
    primary.admin.command("replSetInitiate", config)

    # ensure that the replSet is properly configured
    while retry_until_ok(primary.admin.command,
                         "replSetGetStatus")['myState'] == 0:
        time.sleep(1)

    if use_mongos:
        counter = 100
        while counter > 0:
            try:
                mongos.admin.command("addShard", "demo-repl/localhost:27117")
                break
            except OperationFailure:  # replSet not ready yet
                counter -= 1
                time.sleep(1)

        if counter == 0:
            return False

    if sharded:
        primary2 = Connection('localhost:27317')
        primary2.admin.command("replSetInitiate", config2)

        while retry_until_ok(primary2.admin.command,
                             "replSetGetStatus")['myState'] == 0:
            time.sleep(1)

        counter = 100
        while counter > 0:
            try:
                admin_db = mongos.admin
                admin_db.command("addShard",
                                 "demo-repl-2/localhost:27317",
                                 maxSize=1)
                break
            except OperationFailure:  # replSet not ready yet
                counter -= 1
                time.sleep(1)

        if counter == 0:
            return False

        # shard on the alpha.foo collection
        admin_db = mongos.admin
        admin_db.command("enableSharding", "alpha")
        admin_db.command("shardCollection", "alpha.foo", key={"_id": 1})

    primary = Connection('localhost:27117')
    admin = primary['admin']
    while admin.command("isMaster")['ismaster'] is False:
        time.sleep(1)
    secondary = Connection('localhost:27118')
    while secondary.admin.command("replSetGetStatus")['myState'] is not 2:
        time.sleep(1)
    return True
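
In get_oplog_cursor above, add_option(8) sets the OplogReplay wire-protocol flag on a tailable, await_data cursor. With PyMongo 3.x the same cursor can be requested through CursorType; a sketch under that assumption (field names as in the examples on this page):

from pymongo import CursorType


def tail_oplog(oplog_coll, timestamp):
    """Open a tailable, awaitable oplog cursor starting at timestamp."""
    return oplog_coll.find(
        {'ts': {'$gte': timestamp}},
        cursor_type=CursorType.TAILABLE_AWAIT,  # tailable + awaitData
        oplog_replay=True,                      # same effect as add_option(8)
    )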
Beispiel #53
0
    def run(self):
        """Start the oplog worker.
        """
        logging.debug("OplogThread: Run thread started")
        while self.running is True:
            logging.debug("OplogThread: Getting cursor")
            cursor = self.init_cursor()
            logging.debug("OplogThread: Got the cursor, go go go!")

            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                logging.error('%s %s %s' % (err_msg, effect, self.oplog))
                self.running = False
                continue

            #The only entry is the last one we processed
            if cursor is None or util.retry_until_ok(cursor.count) == 1:
                logging.debug("OplogThread: Last entry is the one we "
                              "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            last_ts = None
            err = False
            remove_inc = 0
            upsert_inc = 0
            try:
                logging.debug("OplogThread: about to process new oplog "
                              "entries")
                while cursor.alive and self.running:
                    logging.debug("OplogThread: Cursor is still"
                                  " alive and thread is still running.")
                    for n, entry in enumerate(cursor):

                        logging.debug("OplogThread: Iterating through cursor,"
                                      " document number in this cursor is %d" %
                                      n)
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        # Don't replicate entries resulting from chunk moves
                        if entry.get("fromMigrate"):
                            continue

                        #sync the current oplog operation
                        operation = entry['op']
                        ns = entry['ns']

                        # use namespace mapping if one exists
                        ns = self.dest_mapping.get(entry['ns'], ns)

                        #delete
                        try:
                            logging.debug("OplogThread: Operation for this "
                                          "entry is %s" % str(operation))
                            if operation == 'd':
                                entry['_id'] = entry['o']['_id']
                                for dm in self.doc_managers:
                                    remove_inc += 1
                                    dm.remove(entry)
                            #insert/update. They are equal because of lack
                            #of support for partial update
                            elif operation == 'i' or operation == 'u':
                                doc = self.retrieve_doc(entry)
                                if doc is not None:
                                    doc['_ts'] = util.bson_ts_to_long(
                                        entry['ts'])
                                    doc['ns'] = ns
                                    for dm in self.doc_managers:
                                        upsert_inc += 1
                                        dm.upsert(doc)
                        except errors.OperationFailed:
                            logging.error(
                                "Unable to %s doc with id %s" %
                                ("delete" if operation == "d" else "upsert",
                                 str(entry['_id'])))
                        except errors.ConnectionFailed:
                            logging.error(
                                "Network error while trying to %s %s" %
                                ("delete" if operation == "d" else "upsert",
                                 str(entry['_id'])))

                        if (remove_inc + upsert_inc) % 1000 == 0:
                            logging.debug("OplogThread: Removed %d "
                                          " documents and upserted %d "
                                          " documents so far" %
                                          (remove_inc, upsert_inc))

                        logging.debug("OplogThread: Doc is processed.")

                        last_ts = entry['ts']

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1 and last_ts is not None:
                            self.checkpoint = last_ts
                            self.update_checkpoint()

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        logging.debug("OplogThread: updating checkpoint after"
                                      "processing new oplog entries")
                        self.checkpoint = last_ts
                        self.update_checkpoint()

            except (pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError):
                err = True

            if err is True and self.auth_key is not None:
                self.primary_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                self.main_connection['admin'].authenticate(
                    self.auth_username, self.auth_key)
                err = False

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                logging.debug("OplogThread: updating checkpoint after an "
                              "Exception, cursor closing, or join() on this"
                              "thread.")
                self.checkpoint = last_ts
                self.update_checkpoint()

            logging.debug("OplogThread: Sleeping.  This batch I removed %d "
                          " documents and I upserted %d documents." %
                          (remove_inc, upsert_inc))
            time.sleep(2)
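
The run() loop above dispatches on the oplog 'op' field: 'd' entries carry the deleted _id in 'o', while 'i'/'u' entries are turned into full documents via retrieve_doc before being upserted. A condensed sketch of that dispatch, with the doc-manager interface and helpers taken as given from the example:

def apply_oplog_entry(entry, doc_managers, retrieve_doc, dest_mapping,
                      bson_ts_to_long):
    """Replicate a single oplog entry to every doc manager."""
    operation = entry['op']
    ns = dest_mapping.get(entry['ns'], entry['ns'])
    if operation == 'd':
        entry['_id'] = entry['o']['_id']
        for dm in doc_managers:
            dm.remove(entry)
    elif operation in ('i', 'u'):
        doc = retrieve_doc(entry)
        if doc is not None:
            doc['_ts'] = bson_ts_to_long(entry['ts'])
            doc['ns'] = ns
            for dm in doc_managers:
                dm.upsert(doc)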
Beispiel #54
0
    def init_cursor(self):
        """Position the cursor appropriately.

        The cursor is set to either the beginning of the oplog, or
        wherever it was last left off.

        Returns the cursor and the number of documents left in the cursor.
        """
        timestamp = self.read_last_checkpoint()

        if timestamp is None:
            if self.collection_dump:
                # dump collection and update checkpoint
                timestamp = self.dump_collection()
                if timestamp is None:
                    return None, 0
            else:
                # Collection dump disabled:
                # return cursor to beginning of oplog.
                cursor = self.get_oplog_cursor()
                self.checkpoint = self.get_last_oplog_timestamp()
                self.update_checkpoint()
                return cursor, retry_until_ok(cursor.count)

        self.checkpoint = timestamp
        self.update_checkpoint()

        for i in range(60):
            cursor = self.get_oplog_cursor(timestamp)
            cursor_len = retry_until_ok(cursor.count)

            if cursor_len == 0:
                # rollback, update checkpoint, and retry
                LOG.debug("OplogThread: Initiating rollback from "
                          "get_oplog_cursor")
                self.checkpoint = self.rollback()
                self.update_checkpoint()
                return self.init_cursor()

            # try to get the first oplog entry
            try:
                first_oplog_entry = retry_until_ok(next, cursor)
            except StopIteration:
                # It's possible for the cursor to become invalid
                # between the cursor.count() call and now
                time.sleep(1)
                continue

            # first entry should be last oplog entry processed
            cursor_ts_long = util.bson_ts_to_long(first_oplog_entry.get("ts"))
            given_ts_long = util.bson_ts_to_long(timestamp)
            if cursor_ts_long > given_ts_long:
                # first entry in oplog is beyond timestamp
                # we've fallen behind
                return None, 0

            # first entry has been consumed
            return cursor, cursor_len - 1

        else:
            raise errors.MongoConnectorError(
                "Could not initialize oplog cursor.")
Beispiel #55
0
    def run(self):
        """Discovers the mongo cluster and creates a thread for each primary.
        """
        self.main_conn = self.create_authed_client()
        self.read_oplog_progress()
        conn_type = None

        try:
            self.main_conn.admin.command("isdbgrid")
        except pymongo.errors.OperationFailure:
            conn_type = "REPLSET"

        if conn_type == "REPLSET":
            # Make sure we are connected to a replica set
            is_master = self.main_conn.admin.command("isMaster")
            if "setName" not in is_master:
                LOG.error(
                    'No replica set at "%s"! A replica set is required '
                    'to run mongo-connector. Shutting down...' % self.address
                )
                return

            # Establish a connection to the replica set as a whole
            self.main_conn.close()
            self.main_conn = self.create_authed_client(
                replicaSet=is_master['setName'])

            # non sharded configuration
            oplog = OplogThread(
                self.main_conn, self.doc_managers, self.oplog_progress,
                self.dest_mapping, **self.kwargs)
            self.shard_set[0] = oplog
            LOG.info('MongoConnector: Starting connection thread %s' %
                     self.main_conn)
            oplog.start()

            while self.can_run:
                shard_thread = self.shard_set[0]
                if not (shard_thread.running and shard_thread.is_alive()):
                    LOG.error("MongoConnector: OplogThread"
                              " %s unexpectedly stopped! Shutting down" %
                              (str(self.shard_set[0])))
                    self.oplog_thread_join()
                    for dm in self.doc_managers:
                        dm.stop()
                    return

                self.write_oplog_progress()
                time.sleep(1)

        else:       # sharded cluster
            while self.can_run is True:

                for shard_doc in retry_until_ok(self.main_conn.admin.command,
                                                'listShards')['shards']:
                    shard_id = shard_doc['_id']
                    if shard_id in self.shard_set:
                        shard_thread = self.shard_set[shard_id]
                        if not (shard_thread.running and shard_thread.is_alive()):
                            LOG.error("MongoConnector: OplogThread "
                                      "%s unexpectedly stopped! Shutting "
                                      "down" %
                                      (str(self.shard_set[shard_id])))
                            self.oplog_thread_join()
                            for dm in self.doc_managers:
                                dm.stop()
                            return

                        self.write_oplog_progress()
                        time.sleep(1)
                        continue
                    try:
                        repl_set, hosts = shard_doc['host'].split('/')
                    except ValueError:
                        cause = "The system only uses replica sets!"
                        LOG.exception("MongoConnector: %s", cause)
                        self.oplog_thread_join()
                        for dm in self.doc_managers:
                            dm.stop()
                        return

                    shard_conn = self.create_authed_client(
                        hosts, replicaSet=repl_set)
                    oplog = OplogThread(
                        shard_conn, self.doc_managers, self.oplog_progress,
                        self.dest_mapping, mongos_client=self.main_conn,
                        **self.kwargs)
                    self.shard_set[shard_id] = oplog
                    msg = "Starting connection thread"
                    LOG.info("MongoConnector: %s %s" % (msg, shard_conn))
                    oplog.start()

        self.oplog_thread_join()
        self.write_oplog_progress()
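
In the sharded branch above, each listShards document carries a host string of the form "<replSetName>/<host1>,<host2>,...", which is split into the replica-set name and its seed list before opening a per-shard client. A hedged sketch of just that step, using a plain MongoClient where the example goes through create_authed_client:

from pymongo import MongoClient


def client_for_shard(shard_doc):
    """Build a replica-set client from a listShards document."""
    repl_set, hosts = shard_doc['host'].split('/')
    return MongoClient(hosts.split(','), replicaSet=repl_set)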
Beispiel #56
0
 def commit(self):
     """Send buffered requests and refresh all indexes."""
     self.send_buffered_operations()
     retry_until_ok(self.elastic.indices.refresh, index="")
    def test_rollback(self):
        """Tests rollback. We force a rollback by adding a doc, killing the
            primary, adding another doc, killing the new primary, and then
            restarting both.
        """

        primary_conn = Connection(HOSTNAME, int(PORTS_ONE['PRIMARY']))

        self.conn['test']['test'].insert({'name': 'paul'}, safe=True)
        condition1 = lambda: self.conn['test']['test'].find(
            {'name': 'paul'}).count() == 1
        condition2 = lambda: sum(1 for _ in self.elastic_doc._search()) == 1
        wait_for(condition1)
        wait_for(condition2)

        kill_mongo_proc(HOSTNAME, PORTS_ONE['PRIMARY'])

        new_primary_conn = Connection(HOSTNAME, int(PORTS_ONE['SECONDARY']))

        admin = new_primary_conn['admin']
        wait_for(lambda: admin.command("isMaster")['ismaster'])
        time.sleep(5)

        count = 0
        while True:
            try:
                self.conn['test']['test'].insert({'name': 'pauline'},
                                                 safe=True)
                break
            except OperationFailure:
                time.sleep(1)
                count += 1
                if count >= 60:
                    sys.exit(1)
                continue
        wait_for(lambda: sum(1 for _ in self.elastic_doc._search()) == 2)
        result_set_1 = list(self.elastic_doc._search())
        result_set_2 = self.conn['test']['test'].find_one({'name': 'pauline'})
        self.assertEqual(len(result_set_1), 2)
        #make sure pauline is there
        for item in result_set_1:
            if item['name'] == 'pauline':
                self.assertEqual(item['_id'], str(result_set_2['_id']))
        kill_mongo_proc(HOSTNAME, PORTS_ONE['SECONDARY'])

        start_mongo_proc(PORTS_ONE['PRIMARY'], "demo-repl", "/replset1a",
                         "/replset1a.log", None)
        while primary_conn['admin'].command("isMaster")['ismaster'] is False:
            time.sleep(1)

        start_mongo_proc(PORTS_ONE['SECONDARY'], "demo-repl", "/replset1b",
                         "/replset1b.log", None)

        time.sleep(2)
        result_set_1 = list(self.elastic_doc._search())
        self.assertEqual(len(result_set_1), 1)
        for item in result_set_1:
            self.assertEqual(item['name'], 'paul')
        find_cursor = retry_until_ok(self.conn['test']['test'].find)
        self.assertEqual(retry_until_ok(find_cursor.count), 1)
Beispiel #58
0
 def commit(self):
     """Refresh all Elasticsearch indexes."""
     retry_until_ok(self.elastic.indices.refresh, index="")
    def run(self):
        """Start the oplog worker.
        """
        ReplicationLagLogger(self, 30).start()
        LOG.debug("OplogThread: Run thread started")
        while self.running is True:
            LOG.debug("OplogThread: Getting cursor")
            cursor, cursor_empty = retry_until_ok(self.init_cursor)
            # we've fallen too far behind
            if cursor is None and self.checkpoint is not None:
                err_msg = "OplogThread: Last entry no longer in oplog"
                effect = "cannot recover!"
                LOG.error("%s %s %s" % (err_msg, effect, self.oplog))
                self.running = False
                continue

            if cursor_empty:
                LOG.debug("OplogThread: Last entry is the one we "
                          "already processed.  Up to date.  Sleeping.")
                time.sleep(1)
                continue

            last_ts = None
            remove_inc = 0
            upsert_inc = 0
            update_inc = 0
            try:
                LOG.debug("OplogThread: about to process new oplog entries")
                while cursor.alive and self.running:
                    LOG.debug("OplogThread: Cursor is still"
                              " alive and thread is still running.")
                    for n, entry in enumerate(cursor):
                        # Break out if this thread should stop
                        if not self.running:
                            break

                        LOG.debug("OplogThread: Iterating through cursor,"
                                  " document number in this cursor is %d" % n)

                        LOG.always(
                            "OplogThread: terating through cursor, n {n}, entry {entry}"
                            .format(n=n, entry=entry))

                        skip, is_gridfs_file = self._should_skip_entry(entry)
                        if skip:
                            # update the last_ts on skipped entries to ensure
                            # our checkpoint does not fall off the oplog. This
                            # also prevents reprocessing skipped entries.
                            last_ts = entry["ts"]
                            continue

                        # Sync the current oplog operation
                        operation = entry["op"]
                        ns = entry["ns"]

                        timestamp = util.bson_ts_to_long(entry["ts"])
                        for docman in self.doc_managers:
                            try:
                                LOG.debug("OplogThread: Operation for this "
                                          "entry is %s" % str(operation))

                                # Remove
                                if operation == "d":
                                    docman.remove(entry["o"]["_id"], ns,
                                                  timestamp)
                                    remove_inc += 1

                                # Insert
                                elif operation == "i":  # Insert
                                    # Retrieve inserted document from
                                    # 'o' field in oplog record
                                    doc = entry.get("o")
                                    # Extract timestamp and namespace
                                    if is_gridfs_file:
                                        db, coll = ns.split(".", 1)
                                        gridfile = GridFSFile(
                                            self.primary_client[db][coll], doc)
                                        docman.insert_file(
                                            gridfile, ns, timestamp)
                                    else:
                                        docman.upsert(doc, ns, timestamp)
                                    upsert_inc += 1

                                # Update
                                elif operation == "u":
                                    docman.update(entry["o2"]["_id"],
                                                  entry["o"], ns, timestamp)
                                    update_inc += 1

                                # Command
                                elif operation == "c":
                                    # use unmapped namespace
                                    doc = entry.get("o")
                                    docman.handle_command(
                                        doc, entry["ns"], timestamp)

                            except errors.OperationFailed:
                                LOG.exception(
                                    "Unable to process oplog document %r" %
                                    entry)
                            except errors.ConnectionFailed:
                                LOG.exception(
                                    "Connection failed while processing oplog "
                                    "document %r" % entry)

                        if (remove_inc + upsert_inc + update_inc) % 1000 == 0:
                            LOG.debug("OplogThread: Documents removed: %d, "
                                      "inserted: %d, updated: %d so far" %
                                      (remove_inc, upsert_inc, update_inc))

                        LOG.debug("OplogThread: Doc is processed.")

                        last_ts = entry["ts"]

                        # update timestamp per batch size
                        # n % -1 (default for self.batch_size) == 0 for all n
                        if n % self.batch_size == 1:
                            self.update_checkpoint(last_ts)
                            last_ts = None

                    # update timestamp after running through oplog
                    if last_ts is not None:
                        LOG.debug("OplogThread: updating checkpoint after "
                                  "processing new oplog entries")
                        self.update_checkpoint(last_ts)

            except (
                    pymongo.errors.AutoReconnect,
                    pymongo.errors.OperationFailure,
                    pymongo.errors.ConfigurationError,
            ):
                LOG.exception("Cursor closed due to an exception. "
                              "Will attempt to reconnect.")

            # update timestamp before attempting to reconnect to MongoDB,
            # after being join()'ed, or if the cursor closes
            if last_ts is not None:
                LOG.debug("OplogThread: updating checkpoint after an "
                          "Exception, cursor closing, or join() on this"
                          "thread.")
                self.update_checkpoint(last_ts)

            LOG.debug("OplogThread: Sleeping. Documents removed: %d, "
                      "upserted: %d, updated: %d" %
                      (remove_inc, upsert_inc, update_inc))
            time.sleep(2)
 def commit(self):
     """This function is used to force a commit.
     """
     retry_until_ok(self.solr.commit)
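
retry_until_ok appears in nearly every example on this page. A minimal sketch of such a helper, assuming it retries transient PyMongo errors for a bounded number of attempts (the real mongo_connector.util implementation may differ in which errors it catches and how long it retries):

import time

import pymongo.errors


def retry_until_ok(func, *args, **kwargs):
    """Call func, retrying for a while on transient MongoDB errors."""
    attempts = 0
    while True:
        try:
            return func(*args, **kwargs)
        except (pymongo.errors.AutoReconnect,
                pymongo.errors.OperationFailure):
            attempts += 1
            if attempts >= 60:
                raise
            time.sleep(1)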