コード例 #1
0
    def perform_operation_during_bucket_warmup(self, during_warmup="default"):
        """
        Stop memcached on the master node, attempt an operation while it is
        stopped, resume memcached and validate the docs per collection.

        :param during_warmup: Operation to attempt while memcached is stopped.
                              "create_scope", "drop_scope",
                              "create_collection" or "drop_collection" run
                              the matching scope/collection action.
                              Any other value runs self.random_load(), which
                              is expected to raise while memcached is
                              stopped and is retried after resuming it
        """
        # Stop memcached in master node
        shell_conn = RemoteMachineShellConnection(self.cluster.master)
        # NOTE(review): shell_conn is not disconnected in this method; it
        # stays reachable through self.error_sim - confirm who closes it
        self.error_sim = CouchbaseError(self.log, shell_conn)
        self.error_sim.create(CouchbaseError.STOP_MEMCACHED)
        self.log.info("memcached stopped on master node")

        try:
            if during_warmup == "create_scope":
                self.scope_name = self.bucket_util.get_random_name()
                self.create_scope()
                self.log.info("create scope succeeded")

            elif during_warmup == "drop_scope":
                # Try up to 5 times to pick a scope other than '_default'
                retry = 5
                while retry > 0:
                    scope_dict = self.bucket_util.get_random_scopes(
                        self.bucket_util.buckets, 1, 1)
                    self.scope_name = list(scope_dict[
                        self.bucket.name]["scopes"].keys())[0]
                    if self.scope_name != "_default":
                        break
                    retry -= 1
                self.drop_scope()
                self.log.info("drop scope succeeded")

            elif during_warmup == "create_collection":
                self.collection_name = self.bucket_util.get_random_name()
                self.create_collection()
                self.log.info("create collection succeeded")

            elif during_warmup == "drop_collection":
                collections = self.bucket_util.get_random_collections(
                    self.bucket_util.buckets, 1, 1, 1)
                scope_dict = collections[self.bucket.name]["scopes"]
                self.scope_name = list(scope_dict.keys())[0]
                self.collection_name = list(scope_dict[
                    self.scope_name]["collections"].keys())[0]
                self.drop_collection()
                self.log.info("drop collection succeeded")

            else:
                try:
                    self.random_load()
                    self.log_failure("random operation succeeded")
                except Exception as e:
                    self.log.info(e)
                    # Resume memcached so that the retried load can succeed
                    self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)
                    self.random_load()
        finally:
            # Resume memcached even when the operation above raised, so the
            # node is not left with memcached stopped
            self.error_sim.revert(CouchbaseError.STOP_MEMCACHED)

        self.bucket_util.validate_docs_per_collections_all_buckets()
        self.validate_test_failure()
コード例 #2
0
ファイル: cb_collect.py プロジェクト: sreebhargava143/TAF
    def test_with_server_stopped(self):
        """
        1. Disable auto-failover in the cluster
        2. Stop few servers on the cluster
        3. Run cb_collect_info on all nodes
        4. Make sure cb_collect works for stopped nodes as well
        5. Restart the stopped servers (also done when the test raises)
        """

        # Max seconds to wait for each cb_collect thread
        collect_timeout = 300

        service_to_stop = self.input.param("affect_nodes_with_service",
                                           "kv").split(";")
        num_nodes_to_affect = self.input.param("num_nodes_to_affect", 1)

        nodes_in_cluster = self.__get_server_nodes()
        nodes_to_stop = sample(self.__get_server_nodes(service_to_stop),
                               num_nodes_to_affect)

        # Disable auto-failover to avoid failover of nodes
        status = RestConnection(self.cluster.master) \
            .update_autofailover_settings(False, 120, False)
        self.assertTrue(status, msg="Failure during disabling auto-failover")

        self.log.info("Nodes to stop - %s" % nodes_to_stop)
        # Error objects which may have stopped a server and need a revert
        errors_to_revert = list()
        try:
            for node in nodes_to_stop:
                cb_error = CouchbaseError(self.log,
                                          self.node_data[node]["shell"])
                self.node_data[node]["cb_error"] = cb_error
                # Track before create() so a partial stop is reverted as well
                errors_to_revert.append(cb_error)
                cb_error.create(CouchbaseError.STOP_SERVER)

            for node in nodes_in_cluster:
                self.node_data[node]["cb_collect_task"] = Thread(
                    target=self.cluster_util.run_cb_collect,
                    args=[node, self.node_data[node]["cb_collect_file"]],
                    kwargs={
                        "options": "",
                        "result": self.node_data[node]["cb_collect_result"]
                    })
                self.node_data[node]["cb_collect_task"].start()

            for node in nodes_in_cluster:
                t_node = self.node_data[node]
                try:
                    t_node["cb_collect_task"].join(collect_timeout)
                except RuntimeError as e:
                    self.log_failure("%s cbcollect_info timed-out: %s" %
                                     (node.ip, e))
                    continue

                # join() returns silently on timeout, so a thread that is
                # still alive here means cb_collect did not finish in time
                if t_node["cb_collect_task"].is_alive():
                    self.log_failure(
                        "%s cbcollect_info timed-out after %s seconds" %
                        (node.ip, collect_timeout))
                    continue

                if str(t_node["cb_collect_result"]["file_size"]) == "0":
                    self.log_failure("%s - cbcollect file size is zero" %
                                     node.ip)
        finally:
            # Restarting stopped nodes
            for cb_error in errors_to_revert:
                cb_error.revert(CouchbaseError.STOP_SERVER)

        self.bucket_util.is_warmup_complete(self.bucket_util.buckets)
        self.validate_test_failure()
コード例 #3
0
ファイル: Collections.py プロジェクト: umang-cb/TAF
 def induce_and_revert_failure(self, action):
     """
     Simulate the given error on the last node of self.servers, wait for
     20 seconds and revert it when the error is a stop-type action.

     :param action: CouchbaseError action to simulate on the node
     """
     target_node = self.servers[-1]  # select last node
     remote = RemoteMachineShellConnection(target_node)
     error_sim = CouchbaseError(self.log, remote)
     try:
         error_sim.create(action)
         self.sleep(20, "Wait before reverting the error condition")
     finally:
         try:
             if action in (CouchbaseError.STOP_MEMCACHED,
                           CouchbaseError.STOP_PROMETHEUS):
                 # Stopped processes must be resumed explicitly. For the
                 # kill actions no revert is issued here since, as per the
                 # original note, babysitter brings the process back
                 error_sim.revert(action)
         finally:
             # Close the ssh session even if create / revert failed
             remote.disconnect()
コード例 #4
0
 def stop_process(self):
     """
     Simulate "stop_memcached" on the third node of self.servers for
     20 seconds, then revert it and close the ssh session.
     """
     target_node = self.servers[2]
     remote = RemoteMachineShellConnection(target_node)
     error_sim = CouchbaseError(self.log, remote)
     error_to_simulate = "stop_memcached"
     try:
         # Induce the error condition
         error_sim.create(error_to_simulate)
         self.sleep(20, "Wait before reverting the error condition")
     finally:
         try:
             # Revert the simulated error condition, even on failure,
             # so the node is not left with the process stopped
             error_sim.revert(error_to_simulate)
         finally:
             # Close the ssh session
             remote.disconnect()
コード例 #5
0
    def test_with_sync_write(self):
        """
        Run a durable (sync) operation and a non-durable (async) operation
        on two different keys while an error is simulated on a KV node.

        1. Pick a random KV node and generate two keys which map to the
           node's vbuckets of the type returned by
           DurabilityHelper.get_vb_and_error_type
        2. Pre-create a doc when its operation is not a 'create'
        3. Simulate the error and start the sync op on key_1, followed by
           the async op on key_2 one second later
        4. Revert the error once the async op is done and wait for
           the sync op to complete
        """
        cluster_node = choice(self.kv_nodes)
        target_vb_type, simulate_error = \
            DurabilityHelper.get_vb_and_error_type(self.durability_level)
        doc_gen = doc_generator(
            self.key,
            0,
            2,
            target_vbucket=self.node_data[cluster_node]["%s_vbs" %
                                                        target_vb_type])
        client = self.sdk_client_pool.get_client_for_bucket(
            self.bucket, self.scope_name, self.collection_name)

        key_1, value_1 = doc_gen.next()
        key_2, value_2 = doc_gen.next()

        if self.doc_ops[0] != DocLoading.Bucket.DocOps.CREATE:
            client.crud(DocLoading.Bucket.DocOps.CREATE, key_1, value_1)
        if self.doc_ops[1] != DocLoading.Bucket.DocOps.CREATE:
            client.crud(DocLoading.Bucket.DocOps.CREATE, key_2, value_2)

        # NOTE(review): the meaning of 'expected_thread_val' is defined by
        # self.crud, which is not visible here
        sync_op = Thread(target=self.crud,
                         args=[client, self.doc_ops[0], key_1],
                         kwargs={
                             "value": value_1,
                             "durability": self.durability_level,
                             "expected_thread_val": 1
                         })
        async_op = Thread(target=self.crud,
                          args=[client, self.doc_ops[1], key_2],
                          kwargs={
                              "value": value_2,
                              "expected_thread_val": 0
                          })

        cb_err = CouchbaseError(self.log,
                                self.node_data[cluster_node]["shell"])
        cb_err.create(simulate_error, self.bucket.name)

        try:
            # Start doc_ops
            sync_op.start()
            self.sleep(1, "Wait before async operation")
            async_op.start()

            # Wait for the async op to complete
            async_op.join()
        finally:
            # Revert even on failure so the node is not left in error state
            cb_err.revert(simulate_error, self.bucket.name)

        # The sync op is waited for only after the error is reverted
        sync_op.join()

        self.validate_test_failure()
コード例 #6
0
 def test_prometheus_and_ns_server_stats_after_crash_scenarios(self):
     """
     Run all metrics before and after crash and validate
     both ns_server and prometheus stats

     The crash is induced by killing self.process_name on the first node
     of self.servers with the signal selected by self.sig_type.
     """
     self.bucket_util.load_sample_bucket(self.cluster, TravelSample())
     target_node = self.servers[0]
     remote = RemoteMachineShellConnection(target_node)
     self.log.info("Before failure")
     self.get_all_metrics(self.components, self.parse, self.metric_name)
     try:
         self.log.info("Killing {0} on node {1}".format(
             self.process_name, target_node.ip))
         remote.kill_process(self.process_name,
                             self.service_name,
                             signum=signum[self.sig_type])
         self.sleep(20, "Wait for the process to come backup")
     finally:
         # Close the ssh session even when the kill / wait fails
         remote.disconnect()
     self.log.info("After failure")
     self.get_all_metrics(self.components, self.parse, self.metric_name)
コード例 #7
0
ファイル: basic_ops.py プロジェクト: AnithaKuberan/TAF
    def MB36948(self):
        """
        Sync write against a node with memcached stopped.

        1. Rebalance-in a node with n1ql + index services
        2. Stop memcached on the first node of self.servers
        3. A plain create is expected to succeed
        4. A durable update with a 3 seconds timeout is expected to fail
           with DurabilityAmbiguousException
        5. Resume memcached and verify the bucket holds one item
        """
        node_to_stop = self.servers[0]
        self.log.info("Adding index/query node")
        self.task.rebalance([self.cluster.master], [self.servers[2]], [],
                            services=["n1ql,index"])
        self.log.info("Creating SDK client connection")
        client = SDKClient([self.cluster.master],
                           self.bucket_util.buckets[0],
                           compression_settings=self.sdk_compression)

        ssh_conn = None
        try:
            self.log.info("Stopping memcached on: %s" % node_to_stop)
            ssh_conn = RemoteMachineShellConnection(node_to_stop)
            err_sim = CouchbaseError(self.log, ssh_conn)
            err_sim.create(CouchbaseError.STOP_MEMCACHED)

            try:
                result = client.crud("create", "abort1", "abort1_val")
                if not result["status"]:
                    self.log_failure("Async SET failed")

                result = client.crud("update",
                                     "abort1",
                                     "abort1_val",
                                     durability=self.durability_level,
                                     timeout=3,
                                     time_unit="seconds")
                if result["status"]:
                    self.log_failure("Sync write succeeded")
                # A successful write carries no exception to inspect
                # (presumably result["error"] is empty then - TODO confirm),
                # so the exception type is checked only for failed writes
                elif SDKException.DurabilityAmbiguousException \
                        not in result["error"]:
                    self.log_failure("Invalid exception for sync_write: %s"
                                     % result)
            finally:
                # Resume memcached even when the CRUDs above raised
                self.log.info("Resuming memcached on: %s" % node_to_stop)
                err_sim.revert(CouchbaseError.STOP_MEMCACHED)

            self.bucket_util._wait_for_stats_all_buckets()
            self.bucket_util.verify_stats_all_buckets(1)
        finally:
            self.log.info("Closing ssh & SDK connections")
            if ssh_conn is not None:
                ssh_conn.disconnect()
            client.close()

        self.validate_test_failure()
コード例 #8
0
 def test_prometheus_and_ns_server_stats_after_failure_scenarios(self):
     """
     Collect all metrics once before and once after a simulated failure
     (self.simulate_error) on the first node of self.servers, so that
     ns_server and prometheus stats are validated around the failure.
     """
     self.bucket_util.load_sample_bucket(self.cluster, TravelSample())
     node_shell = RemoteMachineShellConnection(self.servers[0])
     failure = CouchbaseError(self.log, node_shell)

     # Metrics snapshot prior to the failure
     self.log.info("Before failure")
     self.get_all_metrics(self.components, self.parse, self.metric_name)

     try:
         # Keep the error condition in place for 20 seconds
         failure.create(self.simulate_error)
         self.sleep(20, "Wait before reverting the error condition")
     finally:
         # Undo the error condition, then drop the ssh session
         failure.revert(self.simulate_error)
         node_shell.disconnect()

     # Metrics snapshot once the failure is reverted
     self.log.info("After failure")
     self.get_all_metrics(self.components, self.parse, self.metric_name)
コード例 #9
0
ファイル: process_crash.py プロジェクト: bkumaran/TAF
    def test_create_remove_collection_with_node_crash(self):
        """
        1. Read the error scenario to simulate from the test params
           ('simulate_error', defaults to CouchbaseError.KILL_MEMCACHED)
        2. Create error scenario either before or after collection action
           ('crash_during' = pre_action / post_action)
        3. Initiate collection creation/deletion under the bucket
        4. Wait for 60 seconds and revert the error scenario
        5. Validate the outcome of collection creation/deletion
        """
        def create_collection(client_type, bucket_obj, scope, collection):
            # Create the collection using either the SDK client or REST
            if client_type == "sdk":
                client.create_collection(collection, scope)
                self.bucket_util.create_collection_object(bucket_obj, scope,
                                                          {"name": collection})
            elif client_type == "rest":
                self.bucket_util.create_collection(self.cluster.master,
                                                   bucket_obj,
                                                   scope,
                                                   {"name": collection})
            else:
                self.log_failure("Invalid client_type provided")

        def remove_collection(client_type, bucket_obj, scope, collection):
            # Drop the collection using either the SDK client or REST
            if client_type == "sdk":
                client.drop_collection(scope, collection)
                self.bucket_util.mark_collection_as_dropped(bucket_obj, scope,
                                                            collection)
            elif client_type == "rest":
                self.bucket_util.drop_collection(self.cluster.master,
                                                 bucket_obj, scope, collection)
            else:
                self.log_failure("Invalid client_type provided")

        kv_nodes = self.cluster_util.get_kv_nodes()
        # A non-master KV node is required as the crash target
        if len(kv_nodes) < 2:
            self.fail("Need atleast two KV nodes to run this test")

        task = None
        action = self.input.param("action", "create")
        crash_during = self.input.param("crash_during", "pre_action")
        data_load_option = self.input.param("data_load_option", None)
        crash_type = self.input.param("simulate_error",
                                      CouchbaseError.KILL_MEMCACHED)

        if self.scope_name != CbServer.default_scope:
            self.scope_name = \
                BucketUtils.get_random_name(
                    max_length=CbServer.max_scope_name_len)
            self.bucket_util.create_scope(self.cluster.master, self.bucket,
                                          {"name": self.scope_name})
        if self.collection_name != CbServer.default_collection:
            self.collection_name = \
                BucketUtils.get_random_name(
                    max_length=CbServer.max_collection_name_len)

        # Select a KV node other than master node from the cluster
        node_to_crash = kv_nodes[sample(range(1, len(kv_nodes)), 1)[0]]

        client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
        use_client = sample(["sdk", "rest"], 1)[0]

        if action == "remove" \
                and self.collection_name != CbServer.default_collection:
            # Create a collection to be removed
            create_collection(use_client, self.bucket,
                              self.scope_name, self.collection_name)

        # Create a error scenario
        self.log.info("Selected scenario for test '%s'" % crash_type)
        shell = RemoteMachineShellConnection(node_to_crash)
        cb_error = CouchbaseError(self.log, shell)
        try:
            # Load only into vbuckets that are not active on the node
            # to crash
            cbstat_obj = Cbstats(shell)
            active_vbs = cbstat_obj.vbucket_list(self.bucket.name,
                                                 vbucket_type="active")
            target_vbuckets = list(
                set(range(0, 1024)).difference(set(active_vbs)))
            doc_gen = doc_generator(self.key, 0, 1000,
                                    target_vbucket=target_vbuckets)

            if crash_during == "pre_action":
                cb_error.create(crash_type)

            if data_load_option == "mutate_default_collection":
                task = self.task.async_load_gen_docs(
                    self.cluster, self.bucket, doc_gen,
                    DocLoading.Bucket.DocOps.UPDATE,
                    exp=self.maxttl,
                    batch_size=200, process_concurrency=8,
                    compression=self.sdk_compression,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout)

            if action == "create":
                create_collection(self.client_type, self.bucket,
                                  self.scope_name, self.collection_name)
            elif action == "remove":
                remove_collection(self.client_type, self.bucket,
                                  self.scope_name, self.collection_name)

            if crash_during == "post_action":
                cb_error.create(crash_type)

            if data_load_option == "mutate_default_collection":
                self.task_manager.get_task_result(task)

            self.sleep(60, "Wait before reverting the error scenario")
        finally:
            try:
                # Revert even when the steps above raised, so the node is
                # not left in the simulated error state
                cb_error.revert(crash_type)
            finally:
                # Close the SSH connection
                shell.disconnect()

        if self.atomicity is False:
            self.bucket_util.validate_docs_per_collections_all_buckets(
                self.cluster)
        self.validate_test_failure()
コード例 #10
0
ファイル: crash_process.py プロジェクト: AnithaKuberan/TAF
    def test_stop_process(self):
        """
        1. Starting loading docs into the default bucket
        2. Stop the requested process, which will impact the
           memcached operations
        3. Revert the error after 20 seconds
        4. Wait for load bucket task to complete
        5. Validate the docs for durability and retry the failed docs

        The test logs an error and returns without loading when no
        target vbuckets are found on the target node.
        """
        error_to_simulate = self.input.param("simulate_error", None)
        def_bucket = self.bucket_util.buckets[0]
        target_node = self.getTargetNode()
        remote = RemoteMachineShellConnection(target_node)
        try:
            error_sim = CouchbaseError(self.log, remote)
            target_vbuckets = self.getVbucketNumbers(remote, def_bucket.name,
                                                     self.target_node)
            if len(target_vbuckets) == 0:
                self.log.error("No target vbucket list generated to load data")
                return

            # Create doc_generator targeting only the active/replica vbuckets
            # present in the target_node
            gen_load = doc_generator(self.key,
                                     self.num_items,
                                     self.new_docs_to_add,
                                     key_size=self.key_size,
                                     doc_size=self.doc_size,
                                     doc_type=self.doc_type,
                                     target_vbucket=target_vbuckets,
                                     vbuckets=self.cluster_util.vbuckets)

            if self.atomicity:
                task = self.task.async_load_gen_docs_atomicity(
                    self.cluster,
                    self.bucket_util.buckets,
                    gen_load,
                    "create",
                    exp=0,
                    batch_size=10,
                    process_concurrency=self.process_concurrency,
                    replicate_to=self.replicate_to,
                    persist_to=self.persist_to,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    update_count=self.update_count,
                    transaction_timeout=self.transaction_timeout,
                    commit=True,
                    sync=self.sync)
            else:
                task = self.task.async_load_gen_docs(
                    self.cluster,
                    def_bucket,
                    gen_load,
                    "create",
                    exp=0,
                    batch_size=1,
                    process_concurrency=8,
                    replicate_to=self.replicate_to,
                    persist_to=self.persist_to,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    skip_read_on_error=True)

            try:
                # Induce the error condition
                error_sim.create(error_to_simulate)
                self.sleep(20, "Wait before reverting the error condition")
            finally:
                # Revert the simulated error condition, even on failure
                error_sim.revert(error_to_simulate)
        finally:
            # Close the ssh session on every exit path
            remote.disconnect()

        # Wait for doc loading task to complete
        self.task.jython_task_manager.get_task_result(task)
        if not self.atomicity:
            if len(task.fail.keys()) != 0:
                if self.target_node == "active" or self.num_replicas in [2, 3]:
                    self.log_failure("Unwanted failures for keys: %s" %
                                     task.fail.keys())

            validate_passed = \
                self.durability_helper.validate_durability_exception(
                    task.fail,
                    SDKException.DurabilityAmbiguousException)
            if not validate_passed:
                self.log_failure("Unwanted exception seen during validation")

            # Create SDK connection for CRUD retries
            sdk_client = SDKClient([self.cluster.master], def_bucket)
            try:
                for doc_key, crud_result in task.fail.items():
                    result = sdk_client.crud("create",
                                             doc_key,
                                             crud_result["value"],
                                             replicate_to=self.replicate_to,
                                             persist_to=self.persist_to,
                                             durability=self.durability_level,
                                             timeout=self.sdk_timeout)
                    if result["status"] is False:
                        self.log_failure("Retry of doc_key %s failed: %s" %
                                         (doc_key, result["error"]))
            finally:
                # Close the SDK connection
                sdk_client.close()

        # Update self.num_items
        self.num_items += self.new_docs_to_add

        if not self.atomicity:
            # Validate doc count
            self.bucket_util._wait_for_stats_all_buckets()
            self.bucket_util.verify_stats_all_buckets(self.num_items)

        self.validate_test_failure()
コード例 #11
0
    def test_timeout_with_successful_crud(self):
        """
        Test to make sure sub-doc durability calls still succeed within
        the given timeout when an error is simulated briefly on the
        target nodes.

        1. Select the target nodes from the cluster to simulate the
           specified error
        2. For each sub-doc operation (insert / read / upsert / remove),
           start the load with the given timeout, induce the error,
           wait for 5 seconds and revert the error
        3. Fail the test if a non-read operation reports failures
           (read failures are only logged as warning)
        4. Using cbstats, verify vbucket_seqno differs from the initial
           value on the target nodes after each non-read operation
        5. Read the 'mutated' field from all docs and validate the value
           along with the bucket's doc count

        Note: self.sdk_timeout values is considered as 'seconds'
        """

        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        doc_gen = dict()
        vb_info = dict()
        vb_info["init"] = dict()
        vb_info["afterCrud"] = dict()

        # Floor division keeps the doc index boundaries as integers
        half_items = self.num_items // 2
        quarter_items = self.num_items // 4

        target_nodes = self.getTargetNodes()
        try:
            for node in target_nodes:
                shell_conn[node.ip] = RemoteMachineShellConnection(node)
                cbstat_obj[node.ip] = Cbstats(node)
                vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                    self.bucket.name)
                error_sim[node.ip] = CouchbaseError(self.log,
                                                    shell_conn[node.ip])

            doc_gen["insert"] = sub_doc_generator(
                self.key,
                half_items,
                self.num_items,
                key_size=self.key_size,
                doc_size=self.sub_doc_size)
            doc_gen["read"] = sub_doc_generator(
                self.key,
                quarter_items,
                half_items,
                key_size=self.key_size)
            doc_gen["upsert"] = sub_doc_generator_for_edit(
                self.key,
                quarter_items,
                half_items,
                key_size=self.key_size,
                template_index=2)
            doc_gen["remove"] = sub_doc_generator_for_edit(
                self.key,
                0,
                quarter_items,
                key_size=self.key_size,
                template_index=2)

            for op_type in doc_gen.keys():
                self.log.info("Performing '%s' with timeout=%s" %
                              (op_type, self.sdk_timeout))
                doc_load_task = self.task.async_load_gen_sub_docs(
                    self.cluster,
                    self.bucket,
                    doc_gen[op_type],
                    op_type,
                    self.maxttl,
                    path_create=True,
                    batch_size=500,
                    process_concurrency=8,
                    replicate_to=self.replicate_to,
                    persist_to=self.persist_to,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout)

                try:
                    # Perform specified action
                    for node in target_nodes:
                        error_sim[node.ip].create(
                            self.simulate_error,
                            bucket_name=self.bucket.name)

                    self.sleep(5, "Wait before reverting the error condition")
                finally:
                    # Revert the specified error scenario, even on failure
                    for node in target_nodes:
                        error_sim[node.ip].revert(
                            self.simulate_error,
                            bucket_name=self.bucket.name)

                self.task_manager.get_task_result(doc_load_task)

                if len(doc_load_task.fail.keys()) != 0:
                    if op_type == "read":
                        self.log.warning(
                            "Read failed for %d keys: %s"
                            % (len(doc_load_task.fail.keys()),
                               doc_load_task.fail.keys()))
                    else:
                        self.log_failure("Failures during %s operation: %s" %
                                         (op_type, doc_load_task.fail))

                # Fetch latest stats and validate the values are updated
                for node in target_nodes:
                    if op_type == "read":
                        continue
                    vb_info["afterCrud"][node.ip] = \
                        cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
                    if vb_info["init"][node.ip] \
                            == vb_info["afterCrud"][node.ip]:
                        self.log_failure(
                            "vbucket_seqno not updated. {0} == {1}".format(
                                vb_info["init"][node.ip],
                                vb_info["afterCrud"][node.ip]))

                # # Retry failed docs (if any)
                # retry_failed = self.durability_helper.retry_with_no_error(
                #     client, doc_load_task.fail, op_type)
                # if retry_failed:
                #     self.log_failure(msg.format(op_type))
        finally:
            # Disconnect the shell connections opened so far
            for node in target_nodes:
                if node.ip in shell_conn:
                    shell_conn[node.ip].disconnect()

        # Read mutation field from all docs for validation
        gen_read = sub_doc_generator_for_edit(self.key,
                                              0,
                                              self.num_items,
                                              key_size=self.key_size)
        gen_read.template = '{{ "mutated": "" }}'
        reader_task = self.task.async_load_gen_sub_docs(
            self.cluster,
            self.bucket,
            gen_read,
            "read",
            batch_size=50,
            process_concurrency=8,
            timeout_secs=self.sdk_timeout)
        self.task_manager.get_task_result(reader_task)

        len_failed_keys = len(reader_task.fail.keys())
        if len_failed_keys != 0:
            self.log_failure("Failures in read_task (%d): %s" %
                             (len_failed_keys, reader_task.fail.keys()))
        for doc_key, crud_result in reader_task.success.items():
            # Docs in the upper half of the key range are expected to hold
            # 'mutated' = 1, the remaining docs 'mutated' = 2
            expected_val = 2
            if int(doc_key.split('-')[1]) >= half_items:
                expected_val = 1
            if crud_result["value"][0] != expected_val:
                self.log_failure("Value mismatch for %s: %s" %
                                 (doc_key, crud_result))

        # Verify initial doc load count
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)

        self.validate_test_failure()
コード例 #12
0
ファイル: rollback_tests.py プロジェクト: sreebhargava143/TAF
    def test_rollback_n_times(self):
        """
        Run self.num_rollbacks rollback cycles against one randomly
        chosen KV node.

        Each cycle stops persistence on the node, loads docs, waits for
        the expected ep_queue / vb_replica_queue sizes, kills memcached
        on the node and validates the item count, the vbucket seqno
        details and the docs per collection after warmup.
        """
        expected_num_items = \
            self.bucket_util.get_expected_total_num_items(self.bucket)

        # Rollback requires at least two nodes and one replica
        if self.nodes_init < 2 or self.num_replicas < 1:
            self.fail("Not enough nodes/replicas to test rollback")

        seqno_keys = ["max_visible_seqno",
                      "num_items",
                      "high_completed_seqno",
                      "purge_seqno"]
        queue_size_per_node = dict()
        replica_queue_size_per_node = dict()
        second_load_task = None

        # Fetch vbucket stats for validation
        self.get_vb_details_cbstats_for_all_nodes("pre_rollback")

        target_node = choice(self.kv_nodes)
        shell = self.node_shells[target_node]["shell"]
        error_sim = CouchbaseError(self.log, shell)
        cb_stats = self.node_shells[target_node]["cbstat"]
        self.target_vbuckets = cb_stats.vbucket_list(self.bucket.name)

        for _ in xrange(self.num_rollbacks):
            self.total_rollback_items = 0
            error_sim.create(CouchbaseError.STOP_PERSISTENCE, self.bucket.name)
            first_load_task = self.load_docs(self.doc_ops)
            if self.rollback_with_multiple_mutation:
                second_load_task = self.load_docs("update")

            # Only the target node is expected to hold queued items
            for cluster_node in self.cluster.nodes_in_cluster:
                if cluster_node.ip == target_node.ip:
                    pending_items = self.total_rollback_items
                else:
                    pending_items = 0
                if self.sync_write_enabled:
                    # Includes prepare+commit mutation
                    pending_items *= 2
                queue_size_per_node[cluster_node] = pending_items
                replica_queue_size_per_node[cluster_node] = 0

            self.log.info("Validating stats")
            for bucket in self.bucket_util.buckets:
                self.bucket_util._wait_for_stat(bucket, queue_size_per_node,
                                                timeout=self.wait_timeout)
                self.bucket_util._wait_for_stat(
                    bucket,
                    replica_queue_size_per_node,
                    stat_name="vb_replica_queue_size",
                    timeout=self.wait_timeout)

            # Rewind the doc indexes of the load tasks, latest task first
            if self.rollback_with_multiple_mutation:
                self.__rewind_doc_index(second_load_task)
            self.__rewind_doc_index(first_load_task)

            error_sim.create(CouchbaseError.KILL_MEMCACHED)
            warmup_done = self.bucket_util._wait_warmup_completed(
                [target_node],
                self.bucket,
                wait_time=300)
            self.assertTrue(warmup_done)
            self.bucket_util.verify_stats_all_buckets(expected_num_items,
                                                      timeout=120)

            self.get_vb_details_cbstats_for_all_nodes("post_rollback")
            self.validate_seq_no_post_rollback("pre_rollback", "post_rollback",
                                               seqno_keys)
            self.bucket_util.validate_docs_per_collections_all_buckets()

        self.validate_test_failure()
コード例 #13
0
    def test_bulk_sync_write_in_progress(self):
        """
        Validate the SDK errors returned for doc_ops which are issued on
        keys that are already targeted by another in-flight load.

        1. Simulate the error selected by
           __get_d_level_and_error_to_simulate on the target nodes
        2. Start the first load (doc_ops[0]) asynchronously from a spec
        3. Run the second doc_op (doc_ops[1]) on the same doc generators
           and validate the failures reported by that loader
        4. Revert the error and validate the results of the first load

        Test param 'doc_ops' has the format "<first_op>;<second_op>"
        """
        doc_ops = self.input.param("doc_ops").split(';')
        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        # Must be created only once. Creating "init" inside the node loop
        # drops the seqno snapshot of every node except the last one
        vb_info = dict()
        vb_info["init"] = dict()
        active_vbs = dict()
        replica_vbs = dict()
        sync_write_in_progress = \
            SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS

        # Override d_level, error_simulation type based on d_level
        self.__get_d_level_and_error_to_simulate()

        target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                       self.nodes_init,
                                                       self.num_nodes_affected)
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
            # Fetch the active and replica vb_nums of the affected node
            active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="active")
            replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="replica")

        if self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            # Union of the active vbuckets of all the target nodes
            target_vbs = active_vbs
            target_vbuckets = list()
            for target_node in target_nodes:
                target_vbuckets += target_vbs[target_node.ip]
        else:
            # Replica vbuckets which are common to all the target nodes
            target_vbs = replica_vbs
            target_vbuckets = target_vbs[target_nodes[0].ip]
            for target_node in target_nodes[1:]:
                target_vbuckets = list(
                    set(target_vbuckets).intersection(
                        set(target_vbs[target_node.ip])))

        doc_load_spec = dict()
        doc_load_spec["doc_crud"] = dict()
        doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
        doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
        doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
        doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
        doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60
        doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
            = "test_collections"

        # Only the first doc_op is driven through the spec
        if doc_ops[0] == "create":
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops[0] == "update":
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops[0] == "replace":
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.REPLACE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops[0] == "delete":
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1

        # Induce error condition for testing
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.cluster.buckets,
                doc_load_spec,
                async_load=True)

        self.sleep(5, "Wait for doc ops to reach server")

        tem_durability = self.durability_level
        if self.with_non_sync_writes:
            tem_durability = "NONE"

        for bucket, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        # This will support both sync-write and non-sync-writes
                        doc_loader_task_2 = self.task.async_load_gen_docs(
                            self.cluster,
                            self.bucket,
                            c_meta[op_type]["doc_gen"],
                            doc_ops[1],
                            0,
                            scope=s_name,
                            collection=c_name,
                            sdk_client_pool=self.sdk_client_pool,
                            batch_size=self.crud_batch_size,
                            process_concurrency=1,
                            replicate_to=self.replicate_to,
                            persist_to=self.persist_to,
                            durability=tem_durability,
                            timeout_secs=3,
                            print_ops_rate=False,
                            skip_read_on_error=True,
                            task_identifier="parallel_task2")
                        self.task.jython_task_manager.get_task_result(
                            doc_loader_task_2)

                        # Validation to verify the sync_in_write_errors
                        # in doc_loader_task_2
                        failed_docs = doc_loader_task_2.fail
                        if len(failed_docs.keys()) != 1:
                            self.log_failure(
                                "Exception not seen for docs: %s" %
                                failed_docs)

                        valid_exception = self.durability_helper\
                            .validate_durability_exception(
                                failed_docs,
                                SDKException.AmbiguousTimeoutException,
                                retry_reason=sync_write_in_progress)

                        if not valid_exception:
                            self.log_failure("Got invalid exception")

        # Revert the introduced error condition and release the shell
        # connections, which are not needed beyond this point
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)
            shell_conn[node.ip].disconnect()

        # Wait for doc_loading to complete
        self.task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Doc CRUDs failed")

        # Validate docs for update success or not
        if doc_ops[0] == "update":
            for bucket, s_dict in doc_loading_task.loader_spec.items():
                for s_name, c_dict in s_dict["scopes"].items():
                    for c_name, c_meta in c_dict["collections"].items():
                        for op_type in c_meta:
                            # Read from the collection the docs were
                            # updated in. Without scope/collection the
                            # loader is not pointed at that collection
                            read_task = self.task.async_load_gen_docs(
                                self.cluster,
                                self.bucket,
                                c_meta[op_type]["doc_gen"],
                                "read",
                                scope=s_name,
                                collection=c_name,
                                batch_size=self.crud_batch_size,
                                process_concurrency=1,
                                timeout_secs=self.sdk_timeout)
                            self.task_manager.get_task_result(read_task)
                            for key, doc_info in read_task.success.items():
                                if doc_info["cas"] != 0 \
                                        and json.loads(str(doc_info["value"])
                                                       )["mutated"] != 1:
                                    self.log_failure(
                                        "Update failed for key %s: %s" %
                                        (key, doc_info))

        # Validate doc_count per collection
        self.validate_test_failure()
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster)
コード例 #14
0
    def test_durability_abort(self):
        """
        Test to validate durability abort is triggered properly with proper
        rollback on active vbucket

        For every kv node: simulate the error, run the CRUD spec against
        the node's target vbuckets and revert the error. Afterwards the
        vbucket-details stats are verified, the load results of each node
        are validated and the stats are verified again with the updated
        counters.
        :return:
        """
        load_task = dict()

        # Override d_level, error_simulation type based on d_level
        self.__get_d_level_and_error_to_simulate()

        # The vbucket type to target depends only on the durability level
        target_vb_type = "replica"
        if self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            target_vb_type = "active"

        kv_nodes = self.cluster_util.get_kv_nodes(self.cluster)
        for server in kv_nodes:
            cbstats = Cbstats(server)
            target_vbs = cbstats.vbucket_list(self.bucket.name, target_vb_type)
            doc_load_spec = dict()
            doc_load_spec["doc_crud"] = dict()
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 2
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 2
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 2

            doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
                = "test_collections"
            doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbs

            doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] \
                = self.durability_level
            doc_load_spec[MetaCrudParams.RETRY_EXCEPTIONS] = [
                SDKException.DurabilityAmbiguousException
            ]
            doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 2
            doc_load_spec[MetaCrudParams.SKIP_READ_ON_ERROR] = True
            doc_load_spec[MetaCrudParams.SUPPRESS_ERROR_TABLE] = True

            ssh_shell = RemoteMachineShellConnection(server)
            try:
                cb_err = CouchbaseError(self.log, ssh_shell)
                cb_err.create(self.simulate_error,
                              self.cluster.buckets[0].name)
                try:
                    load_task[server] = \
                        self.bucket_util.run_scenario_from_spec(
                            self.task,
                            self.cluster,
                            self.cluster.buckets,
                            doc_load_spec,
                            batch_size=1,
                            validate_task=False)
                finally:
                    # Revert even when the load raises, so the node is not
                    # left with the error condition in place
                    cb_err.revert(self.simulate_error,
                                  self.cluster.buckets[0].name)
            finally:
                ssh_shell.disconnect()
        self.validate_test_failure()

        failed = self.durability_helper.verify_vbucket_details_stats(
            self.bucket,
            kv_nodes,
            vbuckets=self.cluster.vbuckets,
            expected_val=self.verification_dict)
        if failed:
            self.log_failure("Cbstat vbucket-details verification failed "
                             "after aborts")
        self.validate_test_failure()

        # Validate the load results now that the cluster is healthy
        self.log.info("Performing CRUDs on healthy cluster")
        for server in kv_nodes:
            self.bucket_util.validate_doc_loading_results(load_task[server])
            if load_task[server].result is False:
                self.log_failure("Doc retry task failed on %s" % server.ip)

            # Update cbstat vb-details verification counters
            for s_dict in load_task[server].loader_spec.values():
                for c_dict in s_dict["scopes"].values():
                    for c_crud_data in c_dict["collections"].values():
                        for op_type in c_crud_data.keys():
                            total_mutation = \
                                c_crud_data[op_type]["doc_gen"].end \
                                - c_crud_data[op_type]["doc_gen"].start
                            if op_type in DocLoading.Bucket.DOC_OPS:
                                self.verification_dict["ops_%s" % op_type] \
                                    += total_mutation
                                self.verification_dict[
                                    "sync_write_committed_count"] \
                                    += total_mutation
            failed = self.durability_helper.verify_vbucket_details_stats(
                self.bucket,
                self.cluster_util.get_kv_nodes(self.cluster),
                vbuckets=self.cluster.vbuckets,
                expected_val=self.verification_dict)
            if failed:
                self.log_failure("Cbstat vbucket-details verification "
                                 "failed after ops on server: %s" % server.ip)
        self.validate_test_failure()
コード例 #15
0
    def test_maxttl_with_timeout(self):
        """
        Validate doc TTL handling when the doc_ops are delayed by a
        Memcached stop on the target nodes.

        1. Stop Memcached on target_nodes based on replicas configured.
        2. Initiate doc_ops with higher sdk_timeout
        3. Sleep for time within the configured sdk_timeout
        4. Resume Memcached on target_nodes to make sure doc_ops go through
        5. Make sure maxTTL is calculated as soon as the active vbucket
           receives the mutation
        :return:
        """
        shells = dict()
        target_nodes = self.getTargetNodes()
        def_bucket = self.cluster.buckets[0]
        self.maxttl = self.input.param("doc_ttl", self.maxttl)

        # NOTE(review): this list is still empty when it is handed over to
        # doc_generator and is extended in place only while Memcached is
        # being stopped below. Whether the generator sees those later
        # additions depends on doc_generator - confirm
        target_vbuckets = list()

        # Open required SDK connections before error_simulation
        gen_create = doc_generator(self.key,
                                   0,
                                   self.num_items,
                                   doc_size=self.doc_size,
                                   doc_type=self.doc_type,
                                   target_vbucket=target_vbuckets,
                                   vbuckets=self.cluster.vbuckets)

        def read_all_docs():
            # Reads every key of gen_create and returns the finished task
            read_task = self.task.async_load_gen_docs(
                self.cluster,
                def_bucket,
                gen_create,
                "read",
                batch_size=10,
                process_concurrency=8,
                timeout_secs=self.sdk_timeout,
                sdk_client_pool=self.sdk_client_pool)
            self.task.jython_task_manager.get_task_result(read_task)
            return read_task

        # Task is only created here, it is started after the error is set
        ttl_create_task = self.task.async_load_gen_docs(
            self.cluster,
            def_bucket,
            gen_create,
            "create",
            self.maxttl,
            batch_size=10,
            process_concurrency=8,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            compression=self.sdk_compression,
            start_task=False,
            sdk_client_pool=self.sdk_client_pool)

        # Open shell_conn and create Memcached error for testing MaxTTL
        self.log.info("1. Stopping Memcached on target_nodes")
        for node in target_nodes:
            shell = RemoteMachineShellConnection(node)
            shells[node.ip] = shell
            target_vbuckets.extend(
                Cbstats(shell).vbucket_list(def_bucket.name, "replica"))
            CouchbaseError(self.log, shell).create(
                CouchbaseError.STOP_MEMCACHED, def_bucket.name)

        self.log.info("2. Initiating the doc_ops with doc TTL")
        self.task_manager.add_new_task(ttl_create_task)

        self.sleep(self.maxttl, "3. Sleep for max_ttl time")

        # Revert Memcached error and close the shell_conn
        self.log.info("4. Resuming Memcached on target_nodes")
        for node in target_nodes:
            shell = shells[node.ip]
            CouchbaseError(self.log, shell).revert(
                CouchbaseError.STOP_MEMCACHED, def_bucket.name)
            shell.disconnect()

        self.log.info("5. Waiting for doc_ops to complete")
        self.task.jython_task_manager.get_task_result(ttl_create_task)

        self.bucket_util._expiry_pager(self.cluster, val=1)
        self.sleep(10, "6. Waiting for items to be purged")

        # First read: fail if every doc is still readable
        first_read = read_all_docs()

        self.log.info("7. Validating docs expired after TTL, "
                      "even before sync_write succeeds")
        readable_keys = first_read.success.keys()
        if len(readable_keys) == self.num_items:
            self.fail("No docs deleted after MaxTTL time: %s" %
                      first_read.success.keys())

        self.sleep(10, "8. Waiting for all docs to be purged")
        # Second read: every read is expected to fail by now
        second_read = read_all_docs()

        self.log.info("9. Validating docs expired after TTL")
        if len(second_read.fail.keys()) != self.num_items:
            self.fail("Items not deleted after MaxTTL time: %s" %
                      second_read.success.keys())

        # Validate cas for purged items
        keys_with_cas = [key for key, result in second_read.fail.items()
                         if result['cas'] != 0]
        if keys_with_cas:
            self.fail("Following failed keys has CAS: %s" % keys_with_cas)

        # Recreate all docs without any node issues
        recreate_task = self.task.async_load_gen_docs(
            self.cluster,
            def_bucket,
            gen_create,
            "create",
            0,
            batch_size=10,
            process_concurrency=8,
            durability=self.durability_level,
            timeout_secs=self.sdk_timeout,
            compression=self.sdk_compression,
            sdk_client_pool=self.sdk_client_pool)
        self.task.jython_task_manager.get_task_result(recreate_task)

        self.log.info("10. Validating docs exists after creation")
        if len(recreate_task.fail.keys()) != 0:
            self.fail("Doc recreate failed for keys: %s" %
                      recreate_task.fail.keys())

        # Final doc_count validation
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)
コード例 #16
0
    def test_sub_doc_with_process_crash(self):
        """
        Test to make sure durability will succeed even if a node goes down
        due to crash and has enough nodes to satisfy the durability

        1. Select a node from the cluster to simulate the specified error
        2. Perform CRUD on the target bucket with given timeout
        3. Using cbstats to verify the operation succeeds
        4. Validate all mutations are succeeded

        The induced error is reverted and the shell connections are closed
        even when a step in between raises.

        Note: self.sdk_timeout values is considered as 'seconds'
        """
        if self.num_replicas < 2:
            self.assertTrue(False, msg="Required: num_replicas > 1")

        # Override num_of_nodes affected to 1
        self.num_nodes_affected = 1

        error_sim = dict()
        shell_conn = dict()
        cbstat_obj = dict()
        failover_info = {"init": dict(), "afterCrud": dict()}
        vb_info_info = {"init": dict(), "afterCrud": dict()}
        def_bucket = self.bucket_util.buckets[0]

        self.load_data_for_sub_doc_ops()

        self.log.info("Selecting nodes to simulate error condition")
        target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                       self.nodes_init,
                                                       self.num_nodes_affected)

        # No target_vbuckets filter is set on the spec
        load_spec = dict()
        load_spec["doc_crud"] = dict()
        load_spec["subdoc_crud"] = dict()
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION] = 10
        load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 50
        load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 25
        load_spec["subdoc_crud"][
            MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 25

        self.log.info("Will simulate error condition on %s" % target_nodes)
        try:
            for node in target_nodes:
                # Create shell_connections and record the initial stats
                shell_conn[node.ip] = RemoteMachineShellConnection(node)
                cbstat = Cbstats(shell_conn[node.ip])
                cbstat_obj[node.ip] = cbstat
                vb_info_info["init"][node.ip] = \
                    cbstat.vbucket_seqno(def_bucket.name)
                failover_info["init"][node.ip] = \
                    cbstat.failover_stats(def_bucket.name)

            self.log.info("Perform 'create', 'update', 'delete' mutations")

            doc_loading_task = \
                self.bucket_util.run_scenario_from_spec(
                    self.task,
                    self.cluster,
                    self.bucket_util.buckets,
                    load_spec,
                    mutation_num=1,
                    async_load=True)

            self.sleep(5, "Wait for doc loaders to start loading data")

            try:
                for node in target_nodes:
                    # Perform specified action
                    error_sim[node.ip] = CouchbaseError(
                        self.log, shell_conn[node.ip])
                    error_sim[node.ip].create(self.simulate_error,
                                              bucket_name=def_bucket.name)

                # Perform new scope/collection creation during doc ops
                # in parallel
                self.__perform_collection_crud(mutation_num=2)

                # Wait for document_loader tasks to complete
                self.task_manager.get_task_result(doc_loading_task)
                self.bucket_util.validate_doc_loading_results(
                    doc_loading_task)
                if doc_loading_task.result is False:
                    self.log_failure(
                        "Sub_doc CRUDs failed with process crash")
            finally:
                # Revert the induced error condition, also when one of
                # the steps above raised
                for node_ip in error_sim:
                    error_sim[node_ip].revert(self.simulate_error,
                                              bucket_name=def_bucket.name)

            # Fetch latest failover stats and validate the values
            self.log.info("Validating failover and seqno cbstats")
            for node in target_nodes:
                vb_info_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
                failover_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].failover_stats(def_bucket.name)

                # Failover validation
                # NOTE(review): the check passes when the failover stats
                # are unchanged, while the message talks about stats not
                # being updated - confirm which of the two is intended
                val = \
                    failover_info["init"][node.ip] \
                    == failover_info["afterCrud"][node.ip]
                error_msg = "Failover stats not updated after error condition"
                self.assertTrue(val, msg=error_msg)

                # Seq_no validation (High level)
                val = \
                    vb_info_info["init"][node.ip] \
                    != vb_info_info["afterCrud"][node.ip]
                self.assertTrue(val,
                                msg="vbucket seq_no not updated after CRUDs")
        finally:
            # Disconnect every shell connection that was opened
            for shell in shell_conn.values():
                shell.disconnect()

        self.validate_test_failure()
        # Doc count validation
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()
コード例 #17
0
    def validate_durability_with_crud(
            self, bucket, bucket_durability,
            verification_dict,
            doc_start_index=0,
            num_items_to_load=1, op_type="create",
            doc_durability=Bucket.DurabilityLevel.NONE):
        """
        Common API to validate durability settings of the bucket is set
        correctly or not.

        When self.vbs_in_node holds more than one node, the CRUD is first
        run with an error simulated on a random node and is expected to
        fail with DurabilityAmbiguousException. The same CRUD is then
        run without any error simulation and is expected to succeed.

        :param bucket: Bucket object to validate
        :param bucket_durability: Durability set for the bucket
                                  Note: Need this because the string within the
                                        bucket object is different than this.
        :param verification_dict: To hold the values for req cbstats to verify.
                                  Keys 'sync_write_aborted_count',
                                  'ops_<op_type>' and
                                  'sync_write_committed_count' are
                                  incremented by this method
        :param doc_start_index: Starting index to be considered for doc_load
        :param num_items_to_load: Number of items to be loaded to test.
                                  Default is '1'
        :param op_type: Type of CRUD to perform. Default is 'create'
        :param doc_durability: Document durability level to use during CRUD.
                               Default level is 'None'
        :return:
        """
        # Of the bucket and doc levels, the one placed later in
        # d_level_order is the level considered for the validation
        d_level_to_test = bucket_durability
        if self.d_level_order.index(bucket_durability) \
                < self.d_level_order.index(doc_durability):
            d_level_to_test = doc_durability

        # Nothing to test for durability_level=None (async_write case)
        if d_level_to_test == Bucket.DurabilityLevel.NONE:
            return

        self.log.info("Performing %s operation to validate d_level %s"
                      % (op_type, d_level_to_test))

        # Both doc generators must cover exactly num_items_to_load docs
        # starting at doc_start_index
        doc_end_index = doc_start_index + num_items_to_load

        # Error simulation is attempted only when more than one node is
        # available. Otherwise only the CRUD without error_sim is performed
        if len(self.vbs_in_node.keys()) > 1:
            # Pick a random node to perform error sim and load
            random_node = choice(list(self.vbs_in_node.keys()))

            target_vb_type, simulate_error = \
                self.durability_helper.get_vb_and_error_type(d_level_to_test)

            doc_gen = doc_generator(
                self.key, doc_start_index, doc_end_index,
                target_vbucket=self.vbs_in_node[random_node][target_vb_type])
            error_sim = CouchbaseError(self.log,
                                       self.vbs_in_node[random_node]["shell"])

            doc_load_task = self.task.async_load_gen_docs(
                self.cluster, bucket, doc_gen, op_type,
                exp=self.maxttl,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=doc_durability,
                timeout_secs=32,
                batch_size=1,
                skip_read_on_error=True,
                suppress_error_table=True,
                start_task=False,
                sdk_client_pool=self.sdk_client_pool)

            self.sleep(5, "Wait for sdk_client to get warmed_up")
            # Simulate target error condition
            error_sim.create(simulate_error)
            try:
                self.sleep(5, "Wait for error_sim to take effect")

                # Start doc_loading task and wait for it to complete
                self.task_manager.add_new_task(doc_load_task)
                self.task_manager.get_task_result(doc_load_task)

                self.sleep(5, "Wait before reverting error_simulation")
            finally:
                # Revert the induced error condition, also when the load
                # raised, so the node is not left in the failed state
                error_sim.revert(simulate_error)

            # Validate failed doc count and exception type from SDK
            if not doc_load_task.fail.keys():
                self.log_failure("Docs inserted without honoring the "
                                 "bucket durability level")
            for key, result in doc_load_task.fail.items():
                if SDKException.DurabilityAmbiguousException \
                        not in str(result["error"]):
                    self.log_failure("Invalid exception for key %s "
                                     "during %s operation: %s"
                                     % (key, op_type, result["error"]))

            verification_dict["sync_write_aborted_count"] += num_items_to_load
        else:
            doc_gen = doc_generator(self.key, doc_start_index, doc_end_index)

        # Retry the same CRUDs without any error simulation in place
        doc_load_task = self.task.async_load_gen_docs(
            self.cluster, bucket, doc_gen, op_type,
            exp=self.maxttl,
            durability=doc_durability,
            timeout_secs=2,
            batch_size=1,
            sdk_client_pool=self.sdk_client_pool)
        self.task_manager.get_task_result(doc_load_task)
        if doc_load_task.fail:
            self.log_failure("Failures seen during CRUD without "
                             "error simulation. Keys failed: %s"
                             % doc_load_task.fail.keys())
        else:
            verification_dict["ops_%s" % op_type] += \
                num_items_to_load
            verification_dict["sync_write_committed_count"] += \
                num_items_to_load
コード例 #18
0
    def test_update_durability_between_doc_op(self):
        """
        Update the bucket level durability while a doc_op is in flight.

        1. Create Bucket with durability level set.
        2. Bring down a node such that durability CRUD will wait
        3. Perform doc_op and update bucket_level_durability
        4. Revert scenario induced in step#2, such that doc_op will complete
        5. Make sure doc_ops in step#3 went through using prev. d-level
        """
        # Reverse d_level_order so the bucket starts with its last level,
        # then append that level again so the final update goes back to it
        d_levels = deepcopy(self.d_level_order)
        if self.bucket_type == Bucket.Type.EPHEMERAL:
            d_levels = d_levels[0:2]
        d_levels.reverse()
        d_levels.append(d_levels[0])
        initial_d_level = d_levels[0]

        create_desc = "Creating %s bucket with level '%s'" \
                      % (self.bucket_type, initial_d_level)
        self.log.info(create_desc)

        # Object to support performing CRUDs and create Bucket
        bucket_obj = Bucket(
            self.get_bucket_dict(self.bucket_type, initial_d_level))
        self.bucket_util.create_bucket(self.cluster, bucket_obj,
                                       wait_for_warmup=True)
        self.get_vbucket_type_mapping(bucket_obj.name)
        self.summary.add_step(create_desc)

        self.bucket_util.print_bucket_stats(self.cluster)

        # Loop to update all other durability levels
        prev_d_level = initial_d_level
        for bucket_durability in d_levels[1:]:
            target_vb_type, simulate_error = \
                self.durability_helper.get_vb_and_error_type(bucket_durability)

            # Pick a random node to perform error sim and load
            random_node = choice(self.vbs_in_node.keys())
            node_info = self.vbs_in_node[random_node]
            error_sim = CouchbaseError(self.log, node_info["shell"])

            doc_gen = doc_generator(self.key, 0, 1,
                                    target_vbucket=node_info[target_vb_type])

            # Task is created now but started only after the error is set
            doc_load_task = self.task.async_load_gen_docs(
                self.cluster, bucket_obj, doc_gen, "update",
                durability=Bucket.DurabilityLevel.NONE,
                timeout_secs=60,
                start_task=False,
                sdk_client_pool=self.sdk_client_pool)

            # Simulate target error condition
            error_sim.create(simulate_error)
            self.sleep(5, "Wait before starting doc_op")
            self.task_manager.add_new_task(doc_load_task)

            new_d_level = BucketDurability[bucket_durability]
            self.sleep(5, "Wait before updating bucket level "
                          "durability=%s" % new_d_level)

            self.bucket_util.update_bucket_property(
                self.cluster.master,
                bucket_obj,
                bucket_durability=new_d_level)
            self.bucket_util.print_bucket_stats(self.cluster)

            current_buckets = self.bucket_util.get_all_buckets(self.cluster)
            if current_buckets[0].durability_level != new_d_level:
                self.log_failure("Failed to update bucket_d_level to %s"
                                 % new_d_level)
            self.summary.add_step("Set bucket-durability=%s" % new_d_level)

            # With the previous level 'NONE' the doc_op must already be
            # done. With any other level it must still be pending
            expect_completed = prev_d_level == Bucket.DurabilityLevel.NONE
            if expect_completed and not doc_load_task.completed:
                self.log_failure("Doc-op still pending for d_level 'NONE'")
            elif not expect_completed and doc_load_task.completed:
                self.log_failure("Doc-op completed before reverting the "
                                 "error condition: %s" % simulate_error)

            # Revert the induced error condition
            error_sim.revert(simulate_error)

            self.task_manager.get_task_result(doc_load_task)
            if doc_load_task.fail:
                self.log_failure("Doc_op failed")
            self.summary.add_step("Doc_op with previous d_level %s"
                                  % prev_d_level)
            prev_d_level = bucket_durability

        # Delete the bucket on server
        self.bucket_util.delete_bucket(self.cluster, bucket_obj)
        self.summary.add_step("Delete %s bucket" % self.bucket_type)
コード例 #19
0
    def test_sync_write_in_progress(self):
        """
        Validate the errors returned for a second doc_op performed on a key
        while an earlier durable write on the same key is still pending.

        1. Select the target nodes and collect their active/replica vbuckets
        2. Induce self.simulate_error on all the target nodes
        3. Start an async, spec driven load for doc_ops[0] restricted to
           the selected target vbuckets
        4. For one key per collection / op_type, perform doc_ops[1] through
           a local SDK client and validate the exception and retry reason
        5. Read the same key and validate the result expected for doc_ops[0]
        6. Revert the error condition and validate the async load results

        Test param read here:
        doc_ops - "<op_1>;<op_2>" (default "create;create")
        """
        doc_ops = self.input.param("doc_ops", "create;create").split(';')
        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        vb_info = dict()
        # Created once, before the node loop, so the seqno snapshot of
        # every target node is retained (not only the last node's)
        vb_info["init"] = dict()
        active_vbs = dict()
        replica_vbs = dict()

        # Override d_level, error_simulation type based on d_level
        self.__get_d_level_and_error_to_simulate()

        # Dedicated SDK client for performing the individual doc_ops locally
        client = SDKClient([self.cluster.master], self.bucket)

        target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                       self.nodes_init,
                                                       self.num_nodes_affected)
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
            # Fetch the active and replica vb_nums hosted on the node
            active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="active")
            replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="replica")

        if self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            # Union of the active vbuckets of all the target nodes
            target_vbuckets = list()
            for target_node in target_nodes:
                target_vbuckets += active_vbs[target_node.ip]
        else:
            # Only the vbuckets that are replica on every target node
            target_vbuckets = replica_vbs[target_nodes[0].ip]
            for index in range(1, len(target_nodes)):
                target_vbuckets = list(
                    set(target_vbuckets).intersection(
                        set(replica_vbs[target_nodes[index].ip])))

        doc_load_spec = dict()
        doc_load_spec["doc_crud"] = dict()
        doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
            = "test_collections"
        doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
        doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
        doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
        doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
        doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60

        # Enable only the mutation type requested as the first doc_op
        if doc_ops[0] == DocLoading.Bucket.DocOps.CREATE:
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops[0] == DocLoading.Bucket.DocOps.UPDATE:
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops[0] == DocLoading.Bucket.DocOps.REPLACE:
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.REPLACE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops[0] == DocLoading.Bucket.DocOps.DELETE:
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1

        # Induce error condition for testing
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)
            self.sleep(3, "Wait for error simulation to take effect")

        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.cluster.buckets,
                doc_load_spec,
                async_load=True)

        self.sleep(5, "Wait for doc ops to reach server")

        for bucket, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    client.select_collection(s_name, c_name)
                    for op_type in c_meta:
                        # Only the first key of each generator is validated
                        key, value = c_meta[op_type]["doc_gen"].next()
                        if self.with_non_sync_writes:
                            fail = client.crud(doc_ops[1],
                                               key,
                                               value,
                                               exp=0,
                                               timeout=2,
                                               time_unit="seconds")
                        else:
                            fail = client.crud(
                                doc_ops[1],
                                key,
                                value,
                                exp=0,
                                durability=self.durability_level,
                                timeout=2,
                                time_unit="seconds")

                        expected_exception = \
                            SDKException.AmbiguousTimeoutException
                        retry_reason = \
                            SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
                        # A pending CREATE is not visible yet, so a
                        # DELETE / REPLACE on that key is expected to
                        # report the document as not found instead
                        if doc_ops[0] == DocLoading.Bucket.DocOps.CREATE \
                                and doc_ops[1] in \
                                [DocLoading.Bucket.DocOps.DELETE,
                                 DocLoading.Bucket.DocOps.REPLACE]:
                            expected_exception = \
                                SDKException.DocumentNotFoundException
                            retry_reason = None

                        # Validate the returned error from the SDK
                        if expected_exception not in str(fail["error"]):
                            self.log_failure("Invalid exception for %s: %s" %
                                             (key, fail["error"]))
                        if retry_reason \
                                and retry_reason not in str(fail["error"]):
                            self.log_failure(
                                "Invalid retry reason for %s: %s" %
                                (key, fail["error"]))

                        # Try reading the value in SyncWrite state
                        fail = client.crud("read", key)
                        if doc_ops[0] == "create":
                            # Expected KeyNotFound in case of CREATE op
                            if fail["status"] is True:
                                self.log_failure(
                                    "%s returned value during SyncWrite %s" %
                                    (key, fail))
                        else:
                            # Expects prev val in case of other operations
                            if fail["status"] is False:
                                self.log_failure(
                                    "Key %s read failed for prev value: %s" %
                                    (key, fail))

        # Revert the introduced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # The shells are used only for error simulation; release them now
        for node in target_nodes:
            shell_conn[node.ip].disconnect()

        # Wait for doc_loading to complete
        self.task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Doc CRUDs failed")

        # Close the locally created SDK client
        client.close()
        self.validate_test_failure()
コード例 #20
0
        def test_scenario(bucket, doc_ops,
                          with_sync_write_val=None):
            """
            Run doc_ops[0] as an async load while memcached is stopped on
            the target nodes, then perform doc_ops[1] on the same keys via
            a local SDK client and validate the returned errors.

            :param bucket: Bucket object to run the CRUDs against
            :param doc_ops: Sequence with [first_op, second_op] names
            :param with_sync_write_val: Durability to use for the second
                                        op. Falsy value means the second
                                        op is done without durability
            """
            # Number of docs handled by each generator / load task
            crud_batch_size = 4
            simulate_error = CouchbaseError.STOP_MEMCACHED

            # Fetch target_vbs for CRUDs:
            # vbuckets that are replica on each of the target nodes
            node_vb_info = self.vbs_in_node
            target_vbuckets = node_vb_info[target_nodes[0]]["replica"]
            for node_index in range(1, len(target_nodes)):
                replica_on_node = set(
                    node_vb_info[target_nodes[node_index]]["replica"])
                target_vbuckets = list(
                    set(target_vbuckets).intersection(replica_on_node))

            # Initialize doc_generators to use for testing
            self.log.info("Creating doc_generators")
            gen_create = doc_generator(
                self.key, self.num_items, crud_batch_size,
                vbuckets=self.cluster.vbuckets,
                target_vbucket=target_vbuckets)
            gen_update = doc_generator(
                self.key, 0, crud_batch_size,
                vbuckets=self.cluster.vbuckets,
                target_vbucket=target_vbuckets, mutate=1)
            gen_delete = doc_generator(
                self.key, 0, crud_batch_size,
                vbuckets=self.cluster.vbuckets,
                target_vbucket=target_vbuckets)
            self.log.info("Done creating doc_generators")

            # Generator to use per op name. Unknown op names map to None
            generator_for_op = {
                "create": gen_create,
                "update": gen_update,
                "replace": gen_update,
                "touch": gen_update,
                "delete": gen_delete,
            }

            # Generator for the async load + expected item count tracking
            gen_loader_1 = generator_for_op.get(doc_ops[0])
            if doc_ops[0] == "create":
                self.num_items += crud_batch_size
            elif doc_ops[0] == "delete":
                self.num_items -= crud_batch_size

            # Generator for the individual ops done using the local client
            gen_loader_2 = generator_for_op.get(doc_ops[1])

            # Load required docs for doc_op_1 in case of type != create
            # NOTE: 'doc_op' (singular) is not a parameter of this function,
            #       it is resolved from the enclosing scope
            if doc_op[2] == "load_initial_docs":
                initial_load_task = self.task.async_load_gen_docs(
                    self.cluster, bucket, gen_loader_1, "create", 0,
                    batch_size=crud_batch_size, process_concurrency=1,
                    timeout_secs=10,
                    print_ops_rate=False,
                    sdk_client_pool=self.sdk_client_pool)
                self.task_manager.get_task_result(initial_load_task)
                if initial_load_task.fail:
                    self.log_failure("Failure while loading initial docs")
                self.summary.add_step("Create docs for %s" % doc_op[0])
                verification_dict["ops_create"] += crud_batch_size
                verification_dict["sync_write_committed_count"] \
                    += crud_batch_size

            # Create the load task for the first op without starting it
            doc_loader_task = self.task.async_load_gen_docs(
                self.cluster, bucket, gen_loader_1, doc_ops[0], 0,
                batch_size=crud_batch_size, process_concurrency=8,
                timeout_secs=60,
                print_ops_rate=False,
                start_task=False,
                sdk_client_pool=self.sdk_client_pool)

            # SDK client for performing individual ops
            client = SDKClient([self.cluster.master], bucket)

            # Induce the error condition on all the target nodes
            for node in target_nodes:
                CouchbaseError(
                    self.log,
                    self.vbs_in_node[node]["shell"]
                ).create(simulate_error, bucket_name=bucket.name)
            self.sleep(5, "Wait for error simulation to take effect")

            # Start the first op only after the error is in effect
            self.task_manager.add_new_task(doc_loader_task)
            self.sleep(5, "Wait for task_1 CRUDs to reach server")

            # Perform specified CRUD operation on sync_write docs
            # (using a copy to leave the original generator untouched)
            tem_gen = deepcopy(gen_loader_2)
            while tem_gen.has_next():
                key, value = tem_gen.next()
                for retry_strategy in [
                        SDKConstants.RetryStrategy.FAIL_FAST,
                        SDKConstants.RetryStrategy.BEST_EFFORT]:
                    crud_kwargs = dict(value=value,
                                       exp=0,
                                       timeout=3, time_unit="seconds",
                                       sdk_retry_strategy=retry_strategy)
                    if with_sync_write_val:
                        crud_kwargs["durability"] = with_sync_write_val
                    crud_result = client.crud(doc_ops[1], key, **crud_kwargs)

                    # Defaults, overridden below for FAIL_FAST
                    expected_exception, retry_reason = (
                        SDKException.AmbiguousTimeoutException,
                        SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS)
                    if retry_strategy == SDKConstants.RetryStrategy.FAIL_FAST:
                        expected_exception, retry_reason = (
                            SDKException.RequestCanceledException,
                            SDKException.RetryReason
                            .KV_SYNC_WRITE_IN_PROGRESS_NO_MORE_RETRIES)

                    # Validate the returned error from the SDK
                    sdk_error = str(crud_result["error"])
                    if expected_exception not in sdk_error:
                        self.log_failure("Invalid exception for {0}: {1}"
                                         .format(key, crud_result["error"]))
                    if retry_reason not in sdk_error:
                        self.log_failure("Invalid retry reason for {0}: {1}"
                                         .format(key, crud_result["error"]))

                    # Try reading the value in SyncWrite in-progress state
                    read_result = client.crud("read", key)
                    if doc_ops[0] == "create":
                        # Expected KeyNotFound in case of CREATE operation
                        if read_result["status"] is True:
                            self.log_failure(
                                "%s returned value during SyncWrite state: %s"
                                % (key, read_result))
                    elif read_result["status"] is False:
                        # Expects prev value in case of other operations
                        self.log_failure(
                            "Key %s read failed for previous value: %s"
                            % (key, read_result))

            # Revert the introduced error condition
            for node in target_nodes:
                CouchbaseError(
                    self.log,
                    self.vbs_in_node[node]["shell"]
                ).revert(simulate_error, bucket_name=bucket.name)

            # Wait for doc_loader_task to complete
            self.task.jython_task_manager.get_task_result(doc_loader_task)

            # Account for the first op in the expected stats
            verification_dict["ops_%s" % doc_op[0]] += crud_batch_size
            verification_dict["sync_write_committed_count"] \
                += crud_batch_size

            # Disconnect the client
            client.close()
コード例 #21
0
    def test_sub_doc_sync_write_in_progress(self):
        """
        Test to simulate sync_write_in_progress error and validate the behavior
        This will validate failure in majority of nodes, where durability will
        surely fail for all CRUDs

        1. Select nodes to simulate the error which will affect the durability
        2. Enable the specified error_scenario on the selected nodes
        3. Start an async, spec driven load for the 'doc_ops' mutation
        4. Perform individual sub_doc CRUDs (insert, upsert, remove) on the
           same keys and verify sync_write_in_progress errors
        5. Revert the error_scenario and validate the end results

        Test param read here:
        doc_ops - Mutation type to run as the async load (default "insert")
        """

        doc_ops = self.input.param("doc_ops", "insert")

        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        vb_info = dict()
        active_vbs = dict()
        replica_vbs = dict()
        # Created once, before the node loop, so the seqno snapshot of
        # every target node is retained (not only the last node's)
        vb_info["init"] = dict()
        doc_load_spec = dict()

        # Override d_level, error_simulation type based on d_level
        self.__get_d_level_and_error_to_simulate()

        target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                       self.nodes_init,
                                                       self.num_nodes_affected)
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
            # Fetch the active and replica vb_nums hosted on the node
            active_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="active")
            replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="replica")

        target_vbs = replica_vbs
        if self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            # Union of the active vbuckets of all the target nodes
            target_vbs = active_vbs
            target_vbuckets = list()
            for target_node in target_nodes:
                target_vbuckets += target_vbs[target_node.ip]
        else:
            # Only the vbuckets that are replica on every target node
            target_vbuckets = target_vbs[target_nodes[0].ip]
            for index in range(1, len(target_nodes)):
                target_vbuckets = list(
                    set(target_vbuckets).intersection(
                        set(target_vbs[target_nodes[index].ip])))

        amb_timeout = SDKException.AmbiguousTimeoutException
        kv_sync_write_in_progress = \
            SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
        doc_not_found_exception = SDKException.DocumentNotFoundException

        self.load_data_for_sub_doc_ops()

        doc_load_spec["doc_crud"] = dict()
        doc_load_spec["subdoc_crud"] = dict()
        doc_load_spec["doc_crud"][MetaCrudParams.DocCrud.COMMON_DOC_KEY] \
            = "test_collections"
        doc_load_spec[MetaCrudParams.TARGET_VBUCKETS] = target_vbuckets
        doc_load_spec[MetaCrudParams.DURABILITY_LEVEL] = self.durability_level
        doc_load_spec[MetaCrudParams.COLLECTIONS_CONSIDERED_FOR_CRUD] = 5
        doc_load_spec[MetaCrudParams.SCOPES_CONSIDERED_FOR_CRUD] = "all"
        doc_load_spec[MetaCrudParams.SDK_TIMEOUT] = 60

        # Acquire SDK client from the pool for performing doc_ops locally
        client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
        # Override the crud_batch_size
        self.crud_batch_size = 5

        # Update mutation spec based on the required doc_operation
        if doc_ops == DocLoading.Bucket.DocOps.CREATE:
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 1
        # Equality (not substring membership) to match the other branches
        elif doc_ops == DocLoading.Bucket.DocOps.UPDATE:
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops == DocLoading.Bucket.DocOps.DELETE:
            doc_load_spec["doc_crud"][
                MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 1
        elif doc_ops == DocLoading.Bucket.SubDocOps.INSERT:
            doc_load_spec["subdoc_crud"][
                MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION] = 1
        elif doc_ops == DocLoading.Bucket.SubDocOps.UPSERT:
            doc_load_spec["subdoc_crud"][
                MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION] = 1
        elif doc_ops == DocLoading.Bucket.SubDocOps.REMOVE:
            doc_load_spec["subdoc_crud"][
                MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION] = 1

        # This is to support both sync-write and non-sync-writes
        tem_durability = self.durability_level
        if self.with_non_sync_writes:
            tem_durability = Bucket.DurabilityLevel.NONE

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)
        self.sleep(5, "Wait for error simulation to take effect")

        # Initialize tasks and store the task objects
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.cluster.buckets,
                doc_load_spec,
                mutation_num=2,
                batch_size=1,
                async_load=True)

        # Give the async load some time before issuing the sub_doc ops
        self.sleep(10, "Wait for task_1 CRUDs to reach server")

        for bucket, s_dict in doc_loading_task.loader_spec.items():
            for s_name, c_dict in s_dict["scopes"].items():
                for c_name, c_meta in c_dict["collections"].items():
                    for op_type in c_meta:
                        # Only the first key of each generator is validated
                        key, _ = c_meta[op_type]["doc_gen"].next()
                        expected_exception = amb_timeout
                        retry_reason = kv_sync_write_in_progress
                        if doc_ops == "create":
                            expected_exception = doc_not_found_exception
                            retry_reason = None

                        for sub_doc_op in [
                                DocLoading.Bucket.SubDocOps.INSERT,
                                DocLoading.Bucket.SubDocOps.UPSERT,
                                DocLoading.Bucket.SubDocOps.REMOVE
                        ]:
                            val = ["my_mutation", "val"]
                            if sub_doc_op \
                                    == DocLoading.Bucket.SubDocOps.REMOVE:
                                val = "mutated"
                            result = client.crud(sub_doc_op,
                                                 key,
                                                 val,
                                                 durability=tem_durability,
                                                 timeout=2)

                            # result[0] is checked as the success part and
                            # result[1] is read as {key: {"error": ..}}
                            if result[0]:
                                self.log_failure("Doc crud succeeded for %s" %
                                                 op_type)
                            elif expected_exception \
                                    not in str(result[1][key]["error"]):
                                self.log_failure(
                                    "Invalid exception for key %s: %s" %
                                    (key, result[1][key]["error"]))
                            elif retry_reason is not None and \
                                    retry_reason \
                                    not in str(result[1][key]["error"]):
                                self.log_failure(
                                    "Retry reason missing for key %s: %s" %
                                    (key, result[1][key]["error"]))

        # Revert the introduced error condition
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # The shells are used only for error simulation; release them now
        for node in target_nodes:
            shell_conn[node.ip].disconnect()

        # Wait for doc_loading_task to complete
        self.task.jython_task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Doc CRUDs failed")

        # Validate docs for update success or not
        if doc_ops == DocLoading.Bucket.DocOps.UPDATE:
            for bucket, s_dict in doc_loading_task.loader_spec.items():
                for s_name, c_dict in s_dict["scopes"].items():
                    for c_name, c_meta in c_dict["collections"].items():
                        for op_type in c_meta:
                            # Rewind the generator to read back all keys
                            c_meta[op_type]["doc_gen"].reset()
                            read_task = self.task.async_load_gen_docs(
                                self.cluster,
                                self.bucket,
                                c_meta[op_type]["doc_gen"],
                                DocLoading.Bucket.DocOps.READ,
                                batch_size=self.crud_batch_size,
                                process_concurrency=1,
                                timeout_secs=self.sdk_timeout)
                            self.task_manager.get_task_result(read_task)
                            for key, doc_info in read_task.success.items():
                                # Load was started with mutation_num=2
                                if doc_info["cas"] != 0 and \
                                        json.loads(str(doc_info["value"])
                                                   )["mutated"] != 2:
                                    self.log_failure(
                                        "Update failed for key %s: %s" %
                                        (key, doc_info))

        # Release the acquired SDK client
        self.sdk_client_pool.release_client(client)

        # Verify initial doc load count
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.validate_docs_per_collections_all_buckets(
            self.cluster)
        self.validate_test_failure()
コード例 #22
0
    def test_timeout_with_successful_crud(self):
        """
        Validate that CRUDs started right before a short-lived error
        condition on the target nodes still complete successfully once
        the error condition is reverted.

        For each operation type (doc ops, or sub_doc ops when
        self.subdoc_test is set):
        1. Start an async, spec driven load with timeout=self.sdk_timeout
        2. Induce self.simulate_error on all the target nodes
        3. Sleep for 10 seconds and revert the error condition
        4. Wait for the load to complete and validate its results
        5. Using cbstats, verify vbucket_seqno of the target nodes differs
           from the values recorded before the test

        Note: self.sdk_timeout values is considered as 'seconds'
        """

        conn_for_node = dict()
        stats_for_node = dict()
        err_for_node = dict()
        vb_info = {"init": dict(), "afterCrud": dict()}

        target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                       self.nodes_init,
                                                       self.num_nodes_affected)
        for node in target_nodes:
            conn_for_node[node.ip] = RemoteMachineShellConnection(node)
            stats_for_node[node.ip] = Cbstats(conn_for_node[node.ip])
            # Snapshot of the seqnos before any CRUD is performed
            vb_info["init"][node.ip] = \
                stats_for_node[node.ip].vbucket_seqno(self.bucket.name)
            err_for_node[node.ip] = CouchbaseError(self.log,
                                                   conn_for_node[node.ip])

        doc_crud = MetaCrudParams.DocCrud
        subdoc_crud = MetaCrudParams.SubDocCrud

        # Template spec with every mutation type disabled (0 percent)
        base_spec = {
            MetaCrudParams.SDK_TIMEOUT: self.sdk_timeout,
            MetaCrudParams.DURABILITY_LEVEL: self.durability_level,
            "doc_crud": {
                doc_crud.COMMON_DOC_KEY: "test_collections",
                doc_crud.CREATE_PERCENTAGE_PER_COLLECTION: 0,
                doc_crud.UPDATE_PERCENTAGE_PER_COLLECTION: 0,
                doc_crud.DELETE_PERCENTAGE_PER_COLLECTION: 0,
            },
            "subdoc_crud": {
                subdoc_crud.INSERT_PER_COLLECTION: 0,
                subdoc_crud.UPSERT_PER_COLLECTION: 0,
                subdoc_crud.REMOVE_PER_COLLECTION: 0,
            },
        }

        # op_type -> (spec section, key to enable within that section)
        # "read" is handled separately and "replace" has no entry,
        # so a "replace" iteration runs with the template percentages
        spec_entry_for_op = {
            "create": ("doc_crud",
                       doc_crud.CREATE_PERCENTAGE_PER_COLLECTION),
            "update": ("doc_crud",
                       doc_crud.UPDATE_PERCENTAGE_PER_COLLECTION),
            "delete": ("doc_crud",
                       doc_crud.DELETE_PERCENTAGE_PER_COLLECTION),
            "insert": ("subdoc_crud", subdoc_crud.INSERT_PER_COLLECTION),
            "upsert": ("subdoc_crud", subdoc_crud.UPSERT_PER_COLLECTION),
            "remove": ("subdoc_crud", subdoc_crud.REMOVE_PER_COLLECTION),
        }

        if self.subdoc_test:
            ops_to_perform = ["insert", "upsert", "remove"]
        else:
            ops_to_perform = ["create", "update", "read", "replace", "delete"]

        for op_type in ops_to_perform:
            self.log.info("Performing '%s' with timeout=%s" %
                          (op_type, self.sdk_timeout))
            curr_spec = deepcopy(base_spec)
            if op_type in spec_entry_for_op:
                section, percent_key = spec_entry_for_op[op_type]
                curr_spec[section][percent_key] = 5
            elif op_type == "read":
                curr_spec["doc_crud"][
                    doc_crud.READ_PERCENTAGE_PER_COLLECTION] = 5
                curr_spec[MetaCrudParams.RETRY_EXCEPTIONS] = [
                    SDKException.TimeoutException
                ]

            # Start the load first, then disturb the target nodes
            load_task = self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                curr_spec,
                mutation_num=1,
                async_load=True,
                validate_task=False)

            # Perform specified action
            for node in target_nodes:
                err_for_node[node.ip].create(self.simulate_error,
                                             bucket_name=self.bucket.name)

            self.sleep(10, "Wait before reverting the error condition")

            # Revert the specified error scenario
            for node in target_nodes:
                err_for_node[node.ip].revert(self.simulate_error,
                                             bucket_name=self.bucket.name)

            self.task_manager.get_task_result(load_task)
            self.bucket_util.validate_doc_loading_results(load_task)
            if load_task.result is False:
                self.fail("Doc_loading for '%s' failed" % op_type)

            # Fetch latest stats and validate the values are updated
            for node in target_nodes:
                init_stat = vb_info["init"][node.ip]
                curr_stat = stats_for_node[node.ip].vbucket_seqno(
                    self.bucket.name)
                if init_stat == curr_stat:
                    self.log_failure("vbucket_seqno not updated. %s == %s" %
                                     (init_stat, curr_stat))

        # Disconnect the shell connection
        for node in target_nodes:
            conn_for_node[node.ip].disconnect()

        # Verify initial doc load count
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()
        self.validate_test_failure()
コード例 #23
0
    def test_timeout_with_crud_failures(self):
        """
        Test to make sure timeout is handled in durability calls
        and no documents are loaded when durability cannot be met using
        error simulation in server node side

        This will validate failure in majority of nodes, where durability will
        surely fail for all CRUDs

        1. Select a node from the cluster to simulate the specified error
        2. Perform CRUD on the target bucket with given timeout
        3. Using cbstats to verify no operations succeeds
        4. Revert the error scenario from the cluster to resume durability
        5. Validate all mutations are succeeded after reverting
           the error condition

        Reverting the simulated error and closing the shell connections
        is also done from a `finally` block, so the cleanup runs even when
        the test aborts midway.

        Note: self.sdk_timeout values is considered as 'seconds'
        """

        # Local method to validate vb_seqno
        def validate_vb_seqno_stats():
            """
            Compare the current vbucket_seqno stats of `node` (loop variable
            of the enclosing method) against the "init" snapshot.

            - vBuckets not present in affected_vbs: a changed stat is
              logged as a test failure
            - Affected vBuckets which are active on the target nodes:
              a changed stat is only logged as a warning
            - Affected vBuckets which are replica on the target nodes:
              an unchanged stat is logged as a warning and a retry
              is requested

            :return retry_validation: Boolean denoting to retry validation
            """
            retry_validation = False
            vb_info["post_timeout"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            for vb_id in range(self.cluster.vbuckets):
                vb_id = str(vb_id)
                if vb_id not in affected_vbs:
                    if vb_id in vb_info["init"][node.ip].keys() \
                            and vb_info["init"][node.ip][vb_id] \
                            != vb_info["post_timeout"][node.ip][vb_id]:
                        self.log_failure(
                            "Unaffected vb-%s stat updated: %s != %s" %
                            (vb_id, vb_info["init"][node.ip][vb_id],
                             vb_info["post_timeout"][node.ip][vb_id]))
                elif int(vb_id) \
                        in target_nodes_vbuckets[Bucket.vBucket.ACTIVE]:
                    if vb_id in vb_info["init"][node.ip].keys() \
                            and vb_info["init"][node.ip][vb_id] \
                            != vb_info["post_timeout"][node.ip][vb_id]:
                        self.log.warning(
                            err_msg %
                            (node.ip, Bucket.vBucket.ACTIVE, vb_id,
                             vb_info["init"][node.ip][vb_id],
                             vb_info["post_timeout"][node.ip][vb_id]))
                elif int(vb_id) \
                        in target_nodes_vbuckets[Bucket.vBucket.REPLICA]:
                    if vb_id in vb_info["init"][node.ip].keys() \
                            and vb_info["init"][node.ip][vb_id] \
                            == vb_info["post_timeout"][node.ip][vb_id]:
                        retry_validation = True
                        self.log.warning(
                            err_msg %
                            (node.ip, Bucket.vBucket.REPLICA, vb_id,
                             vb_info["init"][node.ip][vb_id],
                             vb_info["post_timeout"][node.ip][vb_id]))
            return retry_validation

        # Local method to revert the simulated error
        def revert_error_simulation():
            """
            Revert the simulated error on all nodes where it is still
            recorded as active. Calling it again afterwards is a no-op.
            """
            while error_nodes:
                tem_node = error_nodes.pop(0)
                error_sim[tem_node.ip].revert(self.simulate_error,
                                              bucket_name=self.bucket.name)

        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        target_nodes_vbuckets = dict()
        vb_info = dict()
        tasks = dict()
        doc_gen = dict()
        affected_vbs = list()
        # Nodes on which the error simulation is currently active
        error_nodes = list()

        target_nodes_vbuckets[Bucket.vBucket.ACTIVE] = list()
        target_nodes_vbuckets[Bucket.vBucket.REPLICA] = list()
        vb_info["init"] = dict()
        vb_info["post_timeout"] = dict()
        vb_info["afterCrud"] = dict()

        # Override crud_batch_size to minimum value for testing
        self.crud_batch_size = 5

        try:
            target_nodes = self.getTargetNodes()
            for node in target_nodes:
                shell_conn[node.ip] = RemoteMachineShellConnection(node)
                cbstat_obj[node.ip] = Cbstats(node)
                target_nodes_vbuckets[Bucket.vBucket.ACTIVE] += \
                    cbstat_obj[node.ip].vbucket_list(
                        self.bucket.name, vbucket_type=Bucket.vBucket.ACTIVE)
                target_nodes_vbuckets[Bucket.vBucket.REPLICA] += \
                    cbstat_obj[node.ip].vbucket_list(
                        self.bucket.name, vbucket_type=Bucket.vBucket.REPLICA)
                # Snapshot of seqno stats taken before any CRUD is attempted
                vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                    self.bucket.name)
                error_sim[node.ip] = CouchbaseError(self.log,
                                                    shell_conn[node.ip])

            curr_time = int(time.time())
            expected_timeout = curr_time + self.sdk_timeout

            target_vbs = target_nodes_vbuckets[Bucket.vBucket.ACTIVE]
            if self.nodes_init == 1:
                pass
            elif self.durability_level \
                    == Bucket.DurabilityLevel.PERSIST_TO_MAJORITY:
                target_vbs = target_nodes_vbuckets[Bucket.vBucket.REPLICA]

            # Create required doc_generators
            doc_gen["insert"] = sub_doc_generator(self.key,
                                                  self.num_items / 2,
                                                  self.crud_batch_size,
                                                  target_vbucket=target_vbs,
                                                  key_size=self.key_size)
            doc_gen["remove"] = sub_doc_generator_for_edit(
                self.key,
                0,
                self.crud_batch_size,
                key_size=self.key_size,
                template_index=2,
                target_vbucket=target_vbs)
            doc_gen["read"] = sub_doc_generator_for_edit(
                self.key,
                0,
                self.crud_batch_size,
                key_size=self.key_size,
                template_index=0,
                target_vbucket=target_vbs)
            doc_gen["upsert"] = sub_doc_generator_for_edit(
                self.key,
                int(self.num_items / 4),
                self.crud_batch_size,
                key_size=self.key_size,
                template_index=1,
                target_vbucket=target_vbs)

            # Tasks are only created here (start_task=False) and are
            # submitted after the error condition is in place
            for op_type in doc_gen.keys():
                tasks[op_type] = self.task.async_load_gen_sub_docs(
                    self.cluster,
                    self.bucket,
                    doc_gen[op_type],
                    op_type,
                    0,
                    path_create=True,
                    batch_size=1,
                    process_concurrency=8,
                    replicate_to=self.replicate_to,
                    persist_to=self.persist_to,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    start_task=False)

            # Perform specified action
            for node in target_nodes:
                error_sim[node.ip].create(self.simulate_error,
                                          bucket_name=self.bucket.name)
                # Remember the node so the error is reverted on any exit path
                error_nodes.append(node)

            for op_type in doc_gen.keys():
                self.task_manager.add_new_task(tasks[op_type])

            # Wait for document_loader tasks to complete
            for op_type in doc_gen.keys():
                self.task.jython_task_manager.get_task_result(tasks[op_type])

                # Validate task failures
                if op_type == DocLoading.Bucket.DocOps.READ:
                    # Validation for read task
                    if len(tasks[op_type].fail.keys()) != 0:
                        self.log_failure("Read failed for few docs: %s" %
                                         tasks[op_type].fail.keys())
                else:
                    # Validation of CRUDs - Update / Create / Delete
                    for doc_id, crud_result in tasks[op_type].fail.items():
                        vb_num = self.bucket_util.get_vbucket_num_for_key(
                            doc_id, self.cluster.vbuckets)
                        if SDKException.DurabilityAmbiguousException \
                                not in str(crud_result["error"]):
                            self.log_failure(
                                "Invalid exception for doc %s, vb %s: %s" %
                                (doc_id, vb_num, crud_result))

            # Revert the specified error scenario
            revert_error_simulation()

            # Check whether the timeout triggered properly
            if int(time.time()) < expected_timeout:
                self.log_failure("Timed-out before expected time")

            # Collect the vBuckets of all keys targeted by the mutations
            for op_type in doc_gen.keys():
                if op_type == DocLoading.Bucket.DocOps.READ:
                    continue
                while doc_gen[op_type].has_next():
                    doc_id, _ = doc_gen[op_type].next()
                    affected_vbs.append(
                        str(
                            self.bucket_util.get_vbucket_num_for_key(
                                doc_id, self.cluster.vbuckets)))

            affected_vbs = list(set(affected_vbs))
            err_msg = "%s - mismatch in %s vb-%s seq_no: %s != %s"
            # Fetch latest stats and validate the seq_nos are not updated
            for node in target_nodes:
                retry_count = 0
                max_retry = 3
                while retry_count < max_retry:
                    self.log.info("Trying to validate vbseq_no stats: %d" %
                                  (retry_count + 1))
                    retry_count += 1
                    retry_required = validate_vb_seqno_stats()
                    if not retry_required:
                        break
                    self.sleep(5, "Sleep for vbseq_no stats to update")
                else:
                    # This will be exited only if `break` condition is not met
                    self.log_failure(
                        "validate_vb_seqno_stats verification failed")

            self.validate_test_failure()

            # If replicas+1 == total nodes, verify no mutation should have
            # succeeded with durability
            if self.nodes_init == self.num_replicas + 1:
                read_gen = doc_generator(self.key, 0, self.num_items)
                read_task = self.task.async_load_gen_docs(
                    self.cluster,
                    self.bucket,
                    read_gen,
                    DocLoading.Bucket.DocOps.READ,
                    0,
                    batch_size=500,
                    process_concurrency=1,
                    timeout_secs=self.sdk_timeout)
                self.task_manager.get_task_result(read_task)

                failed_keys = TableView(self.log.error)
                failed_keys.set_headers(["Key", "Error"])
                half_of_num_items = self.num_items / 2
                for doc_key, doc_info in read_task.success.items():
                    # Key index is parsed from the text after the first "-"
                    key_index = int(doc_key.split("-")[1])
                    expected_mutated_val = 0
                    if key_index < half_of_num_items:
                        expected_mutated_val = 1
                    mutated = json.loads(str(doc_info["value"]))["mutated"]
                    if mutated != expected_mutated_val:
                        failed_keys.add_row([doc_key, doc_info])

                failed_keys.display("Affected mutations:")
                self.log.error(read_task.fail)

            # Doc error validation
            # NOTE(review): doc_gen[op_type] was already iterated above to
            # collect affected_vbs - confirm the loader task does not depend
            # on the generator's current position
            for op_type in doc_gen.keys():
                task = tasks[op_type]

                retry_task = self.task.async_load_gen_sub_docs(
                    self.cluster,
                    self.bucket,
                    doc_gen[op_type],
                    op_type,
                    0,
                    path_create=True,
                    batch_size=1,
                    process_concurrency=8,
                    replicate_to=self.replicate_to,
                    persist_to=self.persist_to,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout)
                self.task_manager.get_task_result(retry_task)
                retry_failures = set(retry_task.fail.keys())
                initial_failures = set(task.fail.keys())

                # Only keys that failed in the retry but not in the
                # initial attempt are reported
                if retry_failures.difference(initial_failures):
                    self.log_failure(
                        "Docs failed during retry task for %s: %s" %
                        (op_type, retry_task.fail))

            # Verify doc count after expected CRUD failure
            self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                         self.cluster.buckets)
            self.bucket_util.verify_stats_all_buckets(self.cluster,
                                                      self.num_items)

            # Fetch latest stats and validate the values are updated
            for node in target_nodes:
                vb_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
                if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
                    self.log_failure("vBucket seq_no stats not updated")
        finally:
            try:
                # No-op if the error was already reverted above
                revert_error_simulation()
            finally:
                # Disconnect the shell connection
                for conn in shell_conn.values():
                    conn.disconnect()

        self.validate_test_failure()
コード例 #24
0
    def test_timeout_with_crud_failures(self):
        """
        Test to make sure timeout is handled in durability calls
        and no documents are loaded when durability cannot be met using
        error simulation in server node side

        This will validate failure in majority of nodes, where durability will
        surely fail for all CRUDs

        1. Select a node from the cluster to simulate the specified error
        2. Perform CRUD on the target bucket with given timeout
        3. Using cbstats to verify no operations succeeds
        4. Revert the error scenario from the cluster to resume durability
        5. Validate all mutations are succeeded after reverting
           the error condition

        Releasing the SDK client, reverting the simulated error and closing
        the shell connections is also done from a `finally` block, so the
        cleanup runs even when the test aborts midway.

        Note: self.sdk_timeout values is considered as 'seconds'
        """

        # Local methods to validate vb_seqno

        def compare_vb_stat(stat_1, stat_2, vb, comparison="!="):
            """
            Compare "high_seqno" and "high_completed_seqno" of the given
            vBucket between two vbucket_seqno stat snapshots.
            A UUID mismatch for the vBucket is logged as a test failure.

            :param stat_1: First stat snapshot, keyed by vBucket number (str)
            :param stat_2: Second stat snapshot, keyed by vBucket number (str)
            :param vb: vBucket number (str) to compare
            :param comparison: "!=" reports a stat which differs between
                               the snapshots. Any other value reports a stat
                               which is same in both the snapshots
            :return result: False if any stat got reported, else True.
                            True if 'vb' is not present in stat_1
            """
            if vb not in stat_1.keys():
                return True

            # UUID check does not depend on the stat key, so do it only once
            if stat_1[vb]["uuid"] != stat_2[vb]["uuid"]:
                self.log_failure(
                    "Mismatch in vb-%s UUID. %s != %s" %
                    (vb, stat_1[vb]["uuid"], stat_2[vb]["uuid"]))

            result = True
            for key in ["high_seqno", "high_completed_seqno"]:
                if comparison == "!=":
                    if stat_1[vb][key] != stat_2[vb][key]:
                        result = False
                        self.log.warning(
                            "Mismatch in vb-%s stat %s. %s != %s" %
                            (vb, key, stat_1[vb][key], stat_2[vb][key]))
                elif stat_1[vb][key] == stat_2[vb][key]:
                    result = False
                    self.log.warning(
                        "Stat not updated for vb-%s stat %s. "
                        "%s == %s" %
                        (vb, key, stat_1[vb][key], stat_2[vb][key]))
            return result

        def validate_vb_seqno_stats():
            """
            Compare the current vbucket_seqno stats of `node` (loop variable
            of the enclosing method) against the "init" snapshot.

            - vBuckets not present in affected_vbs: a changed stat is
              logged as a test failure
            - Affected vBuckets which are active on the target nodes:
              a changed stat is only logged as a warning
            - Affected vBuckets which are replica on the target nodes:
              an unchanged stat is logged as a warning and a retry
              is requested

            :return retry_validation: Boolean denoting to retry validation
            """
            retry_validation = False
            vb_info["post_timeout"][node.ip] = \
                cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
            for tem_vb_num in range(self.cluster_util.vbuckets):
                tem_vb_num = str(tem_vb_num)
                if tem_vb_num not in affected_vbs:
                    if compare_vb_stat(vb_info["init"][node.ip],
                                       vb_info["post_timeout"][node.ip],
                                       tem_vb_num) is False:
                        self.log_failure("Unaffected vb-%s stat" % tem_vb_num)
                elif int(tem_vb_num) in target_nodes_vbuckets["active"]:
                    if compare_vb_stat(vb_info["init"][node.ip],
                                       vb_info["post_timeout"][node.ip],
                                       tem_vb_num) is False:
                        self.log.warning("%s - mismatch in %s vb-%s seq_no" %
                                         (node.ip, "active", tem_vb_num))
                elif int(tem_vb_num) in target_nodes_vbuckets["replica"]:
                    if compare_vb_stat(vb_info["init"][node.ip],
                                       vb_info["post_timeout"][node.ip],
                                       tem_vb_num,
                                       comparison="==") is False:
                        retry_validation = True
                        self.log.warning("%s - mismatch in %s vb-%s seq_no" %
                                         (node.ip, "replica", tem_vb_num))
            return retry_validation

        # Local method to revert the simulated error
        def revert_error_simulation():
            """
            Revert the simulated error on all nodes where it is still
            recorded as active. Calling it again afterwards is a no-op.
            """
            while error_nodes:
                tem_node = error_nodes.pop(0)
                error_sim[tem_node.ip].revert(self.simulate_error,
                                              bucket_name=self.bucket.name)

        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        target_nodes_vbuckets = dict()
        vb_info = dict()
        tasks = dict()
        doc_gen = dict()
        affected_vbs = list()
        # Nodes on which the error simulation is currently active
        error_nodes = list()
        # SDK client currently borrowed from the client pool (if any)
        sdk_client = None

        target_nodes_vbuckets["active"] = []
        target_nodes_vbuckets["replica"] = []
        vb_info["init"] = dict()
        vb_info["post_timeout"] = dict()
        vb_info["afterCrud"] = dict()

        # Override crud_batch_size to minimum value for testing
        self.crud_batch_size = 5
        self.key = "test_collections"
        self.sdk_timeout = 3

        # Select target vbucket type to load_docs
        target_vb_type = "replica"
        if self.simulate_error == CouchbaseError.STOP_PERSISTENCE \
                and self.durability_level \
                == Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE:
            target_vb_type = "active"

        # Create required scope/collection for successful CRUD operation
        if self.scope_name != CbServer.default_scope:
            self.scope_name = self.bucket_util.get_random_name()
        self.collection_name = self.bucket_util.get_random_name()
        self.log.info("Creating scope::collection %s::%s" %
                      (self.scope_name, self.collection_name))
        self.create_scope_collection()

        # Load docs into created collection
        self.log.info("Loading data into created collection")
        load_gen = doc_generator(self.key, 0, self.num_items)
        task = self.task.async_load_gen_docs(
            self.cluster,
            self.bucket,
            load_gen,
            "create",
            0,
            scope=self.scope_name,
            collection=self.collection_name,
            sdk_client_pool=self.sdk_client_pool,
            batch_size=200,
            process_concurrency=8,
            timeout_secs=60)
        self.task_manager.get_task_result(task)
        if self.subdoc_test:
            load_gen = sub_doc_generator(self.key, 0, self.num_items / 2)
            task = self.task.async_load_gen_sub_docs(
                self.cluster,
                self.bucket,
                load_gen,
                Bucket_Op.SubDocOps.INSERT,
                timeout_secs=self.sdk_timeout,
                compression=self.sdk_compression,
                path_create=True,
                batch_size=100,
                process_concurrency=8,
                durability=self.durability_level,
                scope=self.scope_name,
                collection=self.collection_name,
                sdk_client_pool=self.sdk_client_pool)
            self.task_manager.get_task_result(task)

        # Record the doc count expected in the created collection
        self.bucket.scopes[self.scope_name].collections[
            self.collection_name].num_items = self.num_items

        try:
            target_nodes = DurabilityHelper.getTargetNodes(
                self.cluster,
                self.nodes_init,
                self.num_nodes_affected)
            for node in target_nodes:
                shell_conn[node.ip] = RemoteMachineShellConnection(node)
                cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
                target_nodes_vbuckets["active"] += \
                    cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                     vbucket_type="active")
                target_nodes_vbuckets["replica"] += \
                    cbstat_obj[node.ip].vbucket_list(self.bucket.name,
                                                     vbucket_type="replica")
                # Snapshot of seqno stats taken before the error simulation
                vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                    self.bucket.name)
                error_sim[node.ip] = CouchbaseError(self.log,
                                                    shell_conn[node.ip])

            curr_time = int(time.time())
            expected_timeout = curr_time + self.sdk_timeout

            # Target only the vBuckets which are exclusively of the selected
            # type across the target nodes
            if target_vb_type == "active":
                target_vbs = list(
                    set(target_nodes_vbuckets[target_vb_type]).difference(
                        set(target_nodes_vbuckets["replica"])))
            else:
                target_vbs = list(
                    set(target_nodes_vbuckets[target_vb_type]).difference(
                        set(target_nodes_vbuckets["active"])))

            # Create required doc_generators
            doc_gen["create"] = doc_generator(self.key,
                                              self.num_items,
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs)
            doc_gen["delete"] = doc_generator(self.key,
                                              0,
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs)
            doc_gen["read"] = doc_generator(self.key,
                                            int(self.num_items / 3),
                                            self.crud_batch_size,
                                            target_vbucket=target_vbs)
            doc_gen["update"] = doc_generator(self.key,
                                              int(self.num_items / 2),
                                              self.crud_batch_size,
                                              target_vbucket=target_vbs)

            # Create required subdoc generators
            doc_gen["insert"] = sub_doc_generator(self.key,
                                                  int(self.num_items / 2),
                                                  self.crud_batch_size,
                                                  target_vbucket=target_vbs)
            doc_gen["upsert"] = sub_doc_generator_for_edit(
                self.key,
                0,
                self.crud_batch_size,
                template_index=1,
                target_vbucket=target_vbs)
            doc_gen["remove"] = sub_doc_generator(self.key,
                                                  0,
                                                  self.crud_batch_size,
                                                  target_vbucket=target_vbs)

            # Perform specified action
            for node in target_nodes:
                error_sim[node.ip].create(self.simulate_error,
                                          bucket_name=self.bucket.name)
                # Remember the node so the error is reverted on any exit path
                error_nodes.append(node)
            self.sleep(5, "Wait for error_simulation to take effect")

            ops_to_perform = [
                Bucket_Op.DocOps.CREATE, Bucket_Op.DocOps.UPDATE,
                Bucket_Op.DocOps.READ, Bucket_Op.DocOps.DELETE
            ]
            if self.subdoc_test:
                ops_to_perform = [
                    Bucket_Op.SubDocOps.INSERT, Bucket_Op.SubDocOps.UPSERT,
                    Bucket_Op.SubDocOps.REMOVE
                ]

            # Each operation is started and awaited before the next one
            for op_type in ops_to_perform:
                self.log.info("Starting doc op %s" % op_type)
                if op_type in Bucket_Op.DOC_OPS:
                    tasks[op_type] = self.task.async_load_gen_docs(
                        self.cluster,
                        self.bucket,
                        doc_gen[op_type],
                        op_type,
                        0,
                        scope=self.scope_name,
                        collection=self.collection_name,
                        sdk_client_pool=self.sdk_client_pool,
                        batch_size=1,
                        process_concurrency=8,
                        durability=self.durability_level,
                        timeout_secs=self.sdk_timeout,
                        suppress_error_table=True,
                        print_ops_rate=False,
                        skip_read_on_error=True)
                else:
                    tasks[op_type] = self.task.async_load_gen_sub_docs(
                        self.cluster,
                        self.bucket,
                        doc_gen[op_type],
                        op_type,
                        0,
                        scope=self.scope_name,
                        collection=self.collection_name,
                        sdk_client_pool=self.sdk_client_pool,
                        path_create=True,
                        batch_size=1,
                        process_concurrency=8,
                        durability=self.durability_level,
                        timeout_secs=self.sdk_timeout,
                        print_ops_rate=False)

                self.task.jython_task_manager.get_task_result(tasks[op_type])

                # Validate task failures
                if op_type == Bucket_Op.DocOps.READ:
                    # Validation for read task
                    if len(tasks[op_type].fail.keys()) != 0:
                        self.log_failure("Read failed for few docs: %s" %
                                         tasks[op_type].fail.keys())
                else:
                    # Validation of CRUDs - Update / Create / Delete
                    for doc_id, crud_result in tasks[op_type].fail.items():
                        vb_num = self.bucket_util.get_vbucket_num_for_key(
                            doc_id, self.cluster_util.vbuckets)
                        if SDKException.DurabilityAmbiguousException \
                                not in str(crud_result["error"]):
                            self.log_failure(
                                "Invalid exception for doc %s, vb %s: %s" %
                                (doc_id, vb_num, crud_result))

            # Revert the specified error scenario
            revert_error_simulation()

            # Check whether the timeout triggered properly
            if int(time.time()) < expected_timeout:
                self.log_failure("Timed-out before expected time")

            # Collect the vBuckets of all keys targeted by the mutations
            for op_type in ops_to_perform:
                if op_type == Bucket_Op.DocOps.READ:
                    continue
                while doc_gen[op_type].has_next():
                    doc_id, _ = doc_gen[op_type].next()
                    affected_vbs.append(
                        str(
                            self.bucket_util.get_vbucket_num_for_key(
                                doc_id, self.cluster_util.vbuckets)))

            affected_vbs = list(set(affected_vbs))
            # Fetch latest stats and validate the seq_nos are not updated
            for node in target_nodes:
                retry_count = 0
                max_retry = 3
                while retry_count < max_retry:
                    self.log.info("Trying to validate vbseq_no stats: %d" %
                                  (retry_count + 1))
                    retry_count += 1
                    retry_required = validate_vb_seqno_stats()
                    if not retry_required:
                        break
                    self.sleep(5, "Sleep for vbseq_no stats to update")
                else:
                    # This will be exited only if `break` condition is not met
                    self.log_failure(
                        "validate_vb_seqno_stats verification failed")

            self.validate_test_failure()

            # Get SDK Client from client_pool
            sdk_client = self.sdk_client_pool.get_client_for_bucket(
                self.bucket, self.scope_name, self.collection_name)

            # Doc error validation
            for op_type in ops_to_perform:
                task = tasks[op_type]

                # With a single node, every targeted key is expected to fail
                if self.nodes_init == 1 \
                        and op_type != Bucket_Op.DocOps.READ:
                    expected_failures = \
                        doc_gen[op_type].end - doc_gen[op_type].start
                    if len(task.fail.keys()) != expected_failures:
                        self.log_failure(
                            "Failed keys %d are less than expected %d" %
                            (len(task.fail.keys()), expected_failures))

                # Create table objects for display
                table_view = TableView(self.log.error)
                ambiguous_table_view = TableView(self.log.info)
                table_view.set_headers(["Key", "vBucket", "Exception"])
                ambiguous_table_view.set_headers(["Key", "vBucket"])

                # Iterate failed keys for validation
                for doc_key, doc_info in task.fail.items():
                    # Pass the vbucket count like the other calls in
                    # this test do
                    vb_for_key = self.bucket_util.get_vbucket_num_for_key(
                        doc_key, self.cluster_util.vbuckets)

                    if SDKException.DurabilityAmbiguousException \
                            not in str(doc_info["error"]):
                        table_view.add_row(
                            [doc_key, vb_for_key, doc_info["error"]])

                    # NOTE(review): every failed key is added to this table
                    # and retried, including the ones reported above as
                    # unexpected exceptions - confirm this is intended
                    ambiguous_table_view.add_row([doc_key, str(vb_for_key)])
                    if op_type not in Bucket_Op.SUB_DOC_OPS:
                        retry_success = self.durability_helper \
                            .retry_for_ambiguous_exception(
                                sdk_client, op_type, doc_key, doc_info)
                        if not retry_success:
                            self.log_failure("%s failed in retry for %s" %
                                             (op_type, doc_key))

                # Display the tables (if any errors)
                table_view.display("Unexpected exception during %s" % op_type)
                ambiguous_table_view.display(
                    "D_Ambiguous exception during %s" % op_type)

            # Release the acquired client
            self.sdk_client_pool.release_client(sdk_client)
            # Mark as released so the cleanup below does not release it again
            sdk_client = None

            # Verify doc count after expected CRUD failure
            self.bucket_util._wait_for_stats_all_buckets()
            self.bucket_util.validate_docs_per_collections_all_buckets()

            # Fetch latest stats and validate the values are updated
            for node in target_nodes:
                vb_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
                if vb_info["init"][node.ip] == vb_info["afterCrud"][node.ip]:
                    self.log_failure("vBucket seq_no stats not updated")
        finally:
            try:
                # Return the client to the pool if the test aborted midway
                if sdk_client is not None:
                    self.sdk_client_pool.release_client(sdk_client)
            finally:
                try:
                    # No-op if the error was already reverted above
                    revert_error_simulation()
                finally:
                    # Disconnect the shell connection
                    for conn in shell_conn.values():
                        conn.disconnect()

        self.validate_test_failure()
コード例 #25
0
    def test_sync_write_in_progress(self):
        """
        Test to simulate sync_write_in_progress error and validate the behavior
        This will validate failure in majority of nodes, where durability will
        surely fail for all CRUDs

        1. Select nodes to simulate the error which will affect the durability
        2. Enable the specified error_scenario on the selected nodes
        3. Perform individual CRUDs and verify sync_write_in_progress errors
        4. Validate the end results

        self.doc_ops[0] selects the operation for the first loader task and
        self.doc_ops[1] the operation for the second loader task, whose
        failures are validated against the expected exception.
        """

        kv_ops = DocLoading.Bucket.DocOps
        subdoc_ops = DocLoading.Bucket.SubDocOps
        kv_op_types = [kv_ops.CREATE, kv_ops.UPDATE, kv_ops.DELETE]
        subdoc_op_types = [subdoc_ops.INSERT, subdoc_ops.UPSERT,
                           subdoc_ops.REMOVE]

        shell_conn = dict()
        cbstat_obj = dict()
        error_sim = dict()
        vb_info = dict()
        replica_vbs = dict()
        vb_info["init"] = dict()

        # One doc_generator slot per loader task
        gen_loader = [None, None]
        doc_loader_task_1 = None
        doc_loader_task_2 = None

        # Override the crud_batch_size
        self.crud_batch_size = 5
        expected_failed_doc_num = self.crud_batch_size

        # Select nodes to affect and open required shell_connections
        target_nodes = self.getTargetNodes()
        for node in target_nodes:
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(node)
            vb_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
            # Fetch affected nodes' vb_num which are of type=replica
            replica_vbs[node.ip] = cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, vbucket_type="replica")

        # Keep only the vbuckets that are replica on every target node
        target_vbuckets = replica_vbs[target_nodes[0].ip]
        for node in target_nodes[1:]:
            target_vbuckets = list(
                set(target_vbuckets).intersection(
                    set(replica_vbs[node.ip])))

        # Initialize doc_generators to use for testing
        self.log.info("Creating doc_generators")
        gen_create = doc_generator(self.key,
                                   self.num_items,
                                   self.crud_batch_size,
                                   key_size=self.key_size,
                                   vbuckets=self.cluster.vbuckets,
                                   target_vbucket=target_vbuckets)
        gen_update_delete = doc_generator(self.key,
                                          0,
                                          self.crud_batch_size,
                                          key_size=self.key_size,
                                          vbuckets=self.cluster.vbuckets,
                                          target_vbucket=target_vbuckets,
                                          mutate=1)
        gen_subdoc = sub_doc_generator(self.key,
                                       0,
                                       self.crud_batch_size,
                                       key_size=self.key_size,
                                       vbuckets=self.cluster.vbuckets,
                                       target_vbucket=target_vbuckets)
        self.log.info("Done creating doc_generators")

        initial_num_items = self.num_items
        # Pick the generator (and adjust the expected item count) for task_1
        first_op = self.doc_ops[0]
        if first_op == kv_ops.CREATE:
            self.num_items += self.crud_batch_size
            gen_loader[0] = gen_create
        elif first_op == kv_ops.UPDATE:
            gen_loader[0] = gen_update_delete
        elif first_op == kv_ops.DELETE:
            gen_loader[0] = gen_update_delete
            self.num_items -= self.crud_batch_size
        elif first_op in subdoc_op_types:
            gen_loader[0] = gen_subdoc

        # Pick the generator for task_2
        second_op = self.doc_ops[1]
        if second_op == kv_ops.CREATE:
            gen_loader[1] = gen_create
        elif second_op in [kv_ops.UPDATE, kv_ops.DELETE]:
            gen_loader[1] = gen_update_delete
        elif second_op in subdoc_op_types:
            if second_op == subdoc_ops.INSERT and first_op == kv_ops.CREATE:
                # Sub-doc inserts start from the item count seen before
                # task_1's creates were accounted for
                gen_subdoc = sub_doc_generator(self.key,
                                               initial_num_items,
                                               self.crud_batch_size,
                                               key_size=self.key_size,
                                               vbuckets=self.cluster.vbuckets,
                                               target_vbucket=target_vbuckets)
            gen_loader[1] = gen_subdoc

        # Load task for further upsert / remove operations
        preload_ops = [subdoc_ops.UPSERT, subdoc_ops.REMOVE]
        if first_op in preload_ops or second_op in preload_ops:
            subdoc_load_task = self.task.async_load_gen_sub_docs(
                self.cluster,
                self.bucket,
                gen_subdoc,
                subdoc_ops.INSERT,
                path_create=True,
                batch_size=self.crud_batch_size,
                process_concurrency=8,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(subdoc_load_task)

        # task_2 runs without durability when non-sync-writes are requested
        tem_durability = self.durability_level
        if self.with_non_sync_writes:
            tem_durability = "NONE"

        # Initialize tasks and store the task objects.
        # Both tasks are created with start_task=False and are handed to the
        # task manager explicitly once the error condition is in place.
        if first_op in kv_op_types:
            doc_loader_task_1 = self.task.async_load_gen_docs(
                self.cluster,
                self.bucket,
                gen_loader[0],
                first_op,
                0,
                batch_size=1,
                process_concurrency=self.crud_batch_size,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                print_ops_rate=False,
                start_task=False)
        elif first_op in subdoc_op_types:
            doc_loader_task_1 = self.task.async_load_gen_sub_docs(
                self.cluster,
                self.bucket,
                gen_loader[0],
                first_op,
                0,
                path_create=True,
                batch_size=1,
                process_concurrency=self.crud_batch_size,
                durability=self.durability_level,
                timeout_secs=self.sdk_timeout,
                print_ops_rate=False,
                start_task=False)

        # This will support both sync-write and non-sync-writes
        if second_op in kv_op_types:
            doc_loader_task_2 = self.task.async_load_gen_docs(
                self.cluster,
                self.bucket,
                gen_loader[1],
                second_op,
                0,
                batch_size=self.crud_batch_size,
                process_concurrency=1,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=tem_durability,
                timeout_secs=5,
                task_identifier="parallel_task2",
                print_ops_rate=False,
                start_task=False)
        elif second_op in subdoc_op_types:
            doc_loader_task_2 = self.task.async_load_gen_sub_docs(
                self.cluster,
                self.bucket,
                gen_loader[1],
                second_op,
                0,
                path_create=True,
                batch_size=self.crud_batch_size,
                process_concurrency=1,
                replicate_to=self.replicate_to,
                persist_to=self.persist_to,
                durability=tem_durability,
                timeout_secs=5,
                task_identifier="parallel_task2",
                print_ops_rate=False,
                start_task=False)

        # Perform specified action
        for node in target_nodes:
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)
        self.sleep(5, "Wait for error simulation to take effect")

        # Start the loader_task_1
        self.task_manager.add_new_task(doc_loader_task_1)
        self.sleep(10, "Wait for task_1 CRUDs to reach server")

        # Start the loader_task_2
        self.task_manager.add_new_task(doc_loader_task_2)
        # This task should complete with all sync_write_in_progress errors
        self.task.jython_task_manager.get_task_result(doc_loader_task_2)

        # Revert the introduced error condition and close the shells,
        # which are not used past this point
        for node in target_nodes:
            error_sim[node.ip].revert(self.simulate_error,
                                      bucket_name=self.bucket.name)
            shell_conn[node.ip].disconnect()

        # Wait for doc_loader_task_1 to complete
        self.task.jython_task_manager.get_task_result(doc_loader_task_1)

        # Validation to verify the sync_in_write_errors in doc_loader_task_2
        failed_docs = doc_loader_task_2.fail
        if len(failed_docs.keys()) != expected_failed_doc_num:
            self.log_failure(
                "Exception not seen for few docs: {0}".format(failed_docs))

        expected_exception = SDKException.AmbiguousTimeoutException
        retry_reason = SDKException.RetryReason.KV_SYNC_WRITE_IN_PROGRESS
        if first_op == kv_ops.CREATE:
            # When task_1 is a create, task_2 is expected to fail with
            # document-not-found and no retry reason
            expected_exception = SDKException.DocumentNotFoundException
            retry_reason = None
        valid_exception = self.durability_helper.validate_durability_exception(
            failed_docs, expected_exception, retry_reason=retry_reason)

        if not valid_exception:
            self.log_failure("Got invalid exception")

        # Validate docs for update success or not
        if first_op == kv_ops.UPDATE:
            read_task = self.task.async_load_gen_docs(
                self.cluster,
                self.bucket,
                gen_loader[0],
                kv_ops.READ,
                batch_size=self.crud_batch_size,
                process_concurrency=1,
                timeout_secs=self.sdk_timeout)
            self.task_manager.get_task_result(read_task)
            for key, doc_info in read_task.success.items():
                if doc_info["cas"] != 0 \
                        and json.loads(str(doc_info["value"]))["mutated"] != 1:
                    self.log_failure("Update failed for key %s: %s" %
                                     (key, doc_info))

        # Verify initial doc load count
        self.bucket_util._wait_for_stats_all_buckets(self.cluster,
                                                     self.cluster.buckets)
        self.bucket_util.verify_stats_all_buckets(self.cluster, self.num_items)
        self.validate_test_failure()
コード例 #26
0
    def test_with_persistence_issues(self):
        """
        Test to make sure timeout is handled in durability calls
        and document CRUDs are successful even with disk related failures

        1. Select nodes from the cluster to simulate the specified error
        2. Perform CRUD on the target bucket with given timeout
        3. Using cbstats to verify the operation succeeds
        4. Validate all mutations are succeeded

        Note: self.sdk_timeout value is considered as 'seconds'
        """

        if self.durability_level in [
                Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
                Bucket.DurabilityLevel.PERSIST_TO_MAJORITY
        ]:
            self.log.critical("Test not valid for persistence durability")
            return

        # Disk errors are driven through DiskError, everything else through
        # a per-node CouchbaseError
        is_disk_error = self.simulate_error \
            in [DiskError.DISK_FULL, DiskError.DISK_FAILURE]

        error_sim = dict()
        shell_conn = dict()
        cbstat_obj = dict()
        failover_info = dict()
        vb_info_info = dict()
        failover_info["init"] = dict()
        failover_info["afterCrud"] = dict()
        vb_info_info["init"] = dict()
        vb_info_info["afterCrud"] = dict()

        self.log.info("Selecting nodes to simulate error condition")
        target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                       self.nodes_init,
                                                       self.num_nodes_affected)

        self.log.info("Simulate error condition on %s" % target_nodes)
        for node in target_nodes:
            # One shell per node, shared by cbstats and the error simulation
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            vb_info_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(self.bucket.name)

        if is_disk_error:
            error_sim = DiskError(self.log,
                                  self.task_manager,
                                  self.cluster.master,
                                  target_nodes,
                                  60,
                                  0,
                                  False,
                                  120,
                                  disk_location="/data")
            error_sim.create(action=self.simulate_error)
        else:
            for node in target_nodes:
                # Perform specified action
                error_sim[node.ip] = CouchbaseError(self.log,
                                                    shell_conn[node.ip])
                error_sim[node.ip].create(self.simulate_error,
                                          bucket_name=self.bucket.name)

        # Perform CRUDs with induced error scenario is active
        load_spec = dict()
        load_spec["doc_crud"] = dict()
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 100
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 25
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 25
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.COMMON_DOC_KEY] = "test_collections"

        self.log.info("Perform 'create', 'update', 'delete' mutations")
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                load_spec,
                mutation_num=1,
                async_load=True)

        # Perform new scope/collection creation during doc ops in parallel
        self.__perform_collection_crud(mutation_num=2)

        # Wait for doc_loading to complete and validate the doc ops
        self.task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Doc CRUDs failed with persistence issue")

        if is_disk_error:
            error_sim.revert(self.simulate_error)
        else:
            # Revert the induced error condition
            for node in target_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=self.bucket.name)
            self.sleep(10, "Wait for node recovery to complete")

        # Doc count validation
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()

        # Fetch latest failover stats; the shells are closed afterwards even
        # if a stats call fails, since nothing below needs them
        self.log.info("Validating failover and seqno cbstats")
        try:
            for node in target_nodes:
                vb_info_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
                failover_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].failover_stats(self.bucket.name)
        finally:
            for node in target_nodes:
                shell_conn[node.ip].disconnect()

        for node in target_nodes:
            # Failover validation: stats must be unchanged
            val = \
                failover_info["init"][node.ip] \
                == failover_info["afterCrud"][node.ip]
            error_msg = "Failover stats got updated"
            self.assertTrue(val, msg=error_msg)

            # Seq_no validation (High level): stats must have changed
            val = \
                vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val, msg="vbucket seq_no not updated after CRUDs")

        self.validate_test_failure()

        # Doc count validation
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()
コード例 #27
0
ファイル: crash_process.py プロジェクト: AnithaKuberan/TAF
    def test_process_error_on_nodes(self):
        """
        Test to validate OoO returns feature
        1. Start parallel CRUDs using single client
        2. Perform process crash / stop with doc_ops in parallel
        3. Make sure no crash or ep_eng issue is seen with the err_simulation

        Test params read from self.input:
        crash_on        - 'single_node' (default) picks one random KV node,
                          any other value affects all KV nodes
        simulate_error  - error to induce (default: KILL_MEMCACHED)
        times_to_affect - number of error iterations (default: 20)
        """
        tasks = list()
        node_data = dict()
        bucket = self.bucket_util.buckets[0]
        # Errors that stay in effect until explicitly reverted
        revert_errors = [
            CouchbaseError.STOP_MEMCACHED, CouchbaseError.STOP_SERVER,
            CouchbaseError.STOP_BEAMSMP, CouchbaseError.STOP_PERSISTENCE
        ]
        # Overriding sdk_timeout to max
        self.sdk_timeout = 60

        # Disable auto-failover to avoid failover of nodes
        status = RestConnection(self.cluster.master) \
            .update_autofailover_settings(False, 120, False)
        self.assertTrue(status, msg="Failure during disabling auto-failover")

        # Can take 'all_nodes' / 'single node'
        crash_on = self.input.param("crash_on", "single_node")
        error_to_simulate = self.input.param("simulate_error",
                                             CouchbaseError.KILL_MEMCACHED)
        num_times_to_affect = self.input.param("times_to_affect", 20)
        nodes_to_affect = self.cluster_util.get_kv_nodes()
        if crash_on == "single_node":
            nodes_to_affect = [choice(nodes_to_affect)]

        # Floor division keeps the generator bounds integral on Python 3
        half_items = self.num_items // 2
        create_gen = doc_generator(self.key, self.num_items,
                                   self.num_items * 2)
        update_gen = doc_generator(self.key, 0, half_items)
        delete_gen = doc_generator(self.key, half_items, self.num_items)

        try:
            for node in nodes_to_affect:
                shell = RemoteMachineShellConnection(node)
                # Keep the shell so it can be closed once the test is done
                node_data[node] = dict()
                node_data[node]["shell"] = shell
                node_data[node]["cb_err"] = CouchbaseError(self.log, shell)

            self.log.info("Starting doc-ops")
            for doc_op in self.doc_ops:
                # Anything other than create / delete uses the update range
                load_gen = update_gen
                if doc_op == DocLoading.Bucket.DocOps.CREATE:
                    load_gen = create_gen
                elif doc_op == DocLoading.Bucket.DocOps.DELETE:
                    load_gen = delete_gen
                task = self.task.async_load_gen_docs(
                    self.cluster,
                    bucket,
                    load_gen,
                    doc_op,
                    replicate_to=self.replicate_to,
                    persist_to=self.persist_to,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    sdk_client_pool=self.sdk_client_pool,
                    batch_size=10,
                    process_concurrency=1,
                    skip_read_on_error=True,
                    print_ops_rate=False)
                tasks.append(task)

            self.log.info("Starting error_simulation on %s" % nodes_to_affect)
            for itr in range(1, num_times_to_affect + 1):
                self.log.info("Iteration :: %d" % itr)
                for node in nodes_to_affect:
                    node_data[node]["cb_err"].create(error_to_simulate,
                                                     bucket.name)
                if error_to_simulate in revert_errors:
                    self.sleep(30, "Sleep before reverting the error")
                    for node in nodes_to_affect:
                        node_data[node]["cb_err"].revert(error_to_simulate,
                                                         bucket.name)
                else:
                    self.sleep(10, "Wait for process to come back online")

            # Wait for doc_ops to complete
            for task in tasks:
                self.task_manager.get_task_result(task)
        finally:
            # Close every shell that was opened, even on failure
            for data in node_data.values():
                data["shell"].disconnect()
コード例 #28
0
    def test_with_process_crash(self):
        """
        Test to make sure durability will succeed even if a node goes down
        due to crash and has enough nodes to satisfy the durability

        1. Select a node from the cluster to simulate the specified error
        2. Perform CRUD on the target bucket with given timeout
        3. Using cbstats to verify the operation succeeds
        4. Validate all mutations are succeeded

        Note: self.sdk_timeout values is considered as 'seconds'
        """
        if self.num_replicas < 2:
            self.assertTrue(False, msg="Required: num_replicas > 1")

        # Override num_of_nodes affected to 1 (Positive case)
        self.num_nodes_affected = 1

        error_sim = dict()
        shell_conn = dict()
        cbstat_obj = dict()
        failover_info = dict()
        vb_info_info = dict()
        active_vbs_in_target_nodes = list()
        failover_info["init"] = dict()
        failover_info["afterCrud"] = dict()
        vb_info_info["init"] = dict()
        vb_info_info["afterCrud"] = dict()

        self.log.info("Selecting nodes to simulate error condition")
        target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                       self.nodes_init,
                                                       self.num_nodes_affected)

        self.log.info("Will simulate error condition on %s" % target_nodes)
        for node in target_nodes:
            # One shell per node, shared by cbstats and the error simulation
            shell_conn[node.ip] = RemoteMachineShellConnection(node)
            cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
            active_vbs_in_target_nodes += cbstat_obj[node.ip].vbucket_list(
                self.bucket.name, "active")
            vb_info_info["init"][node.ip] = cbstat_obj[node.ip].vbucket_seqno(
                self.bucket.name)
            failover_info["init"][node.ip] = \
                cbstat_obj[node.ip].failover_stats(self.bucket.name)

        # Remove active vbuckets from doc_loading to avoid errors
        load_spec = dict()
        load_spec["doc_crud"] = dict()
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.CREATE_PERCENTAGE_PER_COLLECTION] = 100
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.UPDATE_PERCENTAGE_PER_COLLECTION] = 25
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.DELETE_PERCENTAGE_PER_COLLECTION] = 25
        load_spec["doc_crud"][
            MetaCrudParams.DocCrud.COMMON_DOC_KEY] = "test_collections"
        # Plain set difference: only ever removes vbuckets from the range
        load_spec["target_vbuckets"] = list(
            set(range(0, 1024)) - set(active_vbs_in_target_nodes))

        self.log.info("Perform 'create', 'update', 'delete' mutations")
        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                load_spec,
                mutation_num=1,
                async_load=True)

        self.sleep(5, "Wait for doc loaders to start loading data")

        for node in target_nodes:
            # Perform specified action
            error_sim[node.ip] = CouchbaseError(self.log, shell_conn[node.ip])
            error_sim[node.ip].create(self.simulate_error,
                                      bucket_name=self.bucket.name)

        # Perform new scope/collection creation during doc ops in parallel
        self.__perform_collection_crud()

        # Wait for document_loader tasks to complete
        self.task_manager.get_task_result(doc_loading_task)
        self.bucket_util.validate_doc_loading_results(doc_loading_task)
        if doc_loading_task.result is False:
            self.log_failure("Doc CRUDs failed with process crash")

        if self.simulate_error \
                not in [DiskError.DISK_FULL, DiskError.DISK_FAILURE]:
            # Revert the induced error condition
            for node in target_nodes:
                error_sim[node.ip].revert(self.simulate_error,
                                          bucket_name=self.bucket.name)
            self.sleep(10, "Wait for node recovery to complete")

            # In case of error with Ephemeral bucket, need to rebalance
            # to make sure data is redistributed properly
            if self.bucket_type == Bucket.Type.EPHEMERAL:
                result = None
                # At most two attempts, pausing after each failed one
                for _ in range(2):
                    result = self.task.rebalance(
                        self.servers[0:self.nodes_init], [], [])
                    if result:
                        break
                    self.sleep(10, "Wait before retrying rebalance")

                self.assertTrue(result, "Rebalance failed")

        # Fetch latest failover stats; the shells are closed afterwards even
        # if a stats call fails, since nothing below needs them
        self.log.info("Validating failover and seqno cbstats")
        try:
            for node in target_nodes:
                vb_info_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].vbucket_seqno(self.bucket.name)
                failover_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].failover_stats(self.bucket.name)
        finally:
            for node in target_nodes:
                shell_conn[node.ip].disconnect()

        # Failover stats are expected to change for KILL_MEMCACHED, and for
        # any error other than STOP_MEMCACHED on an Ephemeral bucket
        expect_failover_change = \
            self.simulate_error == CouchbaseError.KILL_MEMCACHED \
            or (self.simulate_error != CouchbaseError.STOP_MEMCACHED
                and self.bucket_type == Bucket.Type.EPHEMERAL)

        for node in target_nodes:
            # Failover stat validation
            if expect_failover_change:
                val = failover_info["init"][node.ip] \
                      != failover_info["afterCrud"][node.ip]
            else:
                val = failover_info["init"][node.ip] \
                      == failover_info["afterCrud"][node.ip]
            error_msg = "Failover stats mismatch after error condition:" \
                        " %s != %s" \
                        % (failover_info["init"][node.ip],
                           failover_info["afterCrud"][node.ip])
            self.assertTrue(val, msg=error_msg)

            # Seq_no validation (High level)
            val = \
                vb_info_info["init"][node.ip] \
                != vb_info_info["afterCrud"][node.ip]
            self.assertTrue(val, msg="vbucket seq_no not updated after CRUDs")

        # Doc count validation
        self.validate_test_failure()
        self.bucket_util.validate_docs_per_collections_all_buckets()
コード例 #29
0
ファイル: process_crash.py プロジェクト: bkumaran/TAF
    def test_stop_process(self):
        """
        1. Starting loading docs into the default bucket
        2. Stop the requested process, which will impact the
           memcached operations
        3. Wait for load bucket task to complete
        4. Validate the docs for durability

        The error to induce is read from the 'simulate_error' test param.
        Keys that failed during loading are retried as creates with a
        pooled SDK client, and any retry failure is logged as a test failure.
        """
        error_to_simulate = self.input.param("simulate_error", None)
        target_node = self.getTargetNode()
        remote = RemoteMachineShellConnection(target_node)
        error_sim = CouchbaseError(self.log, remote)
        target_vbuckets = CrashTest.getVbucketNumbers(
            remote, self.bucket.name, self.target_node)

        bucket_dict = BucketUtils.get_random_collections(
            self.cluster.buckets,
            req_num=1,
            consider_scopes="all",
            consider_buckets="all")

        # list() keeps the first-key lookups valid where keys() is a view
        bucket = BucketUtils.get_bucket_obj(self.cluster.buckets,
                                            list(bucket_dict.keys())[0])
        scopes = bucket_dict[bucket.name]["scopes"]
        scope_name = list(scopes.keys())[0]
        collection_name = list(
            scopes[scope_name]["collections"].keys())[0]
        scope = BucketUtils.get_scope_obj(
            bucket, scope_name)
        collection = BucketUtils.get_collection_obj(scope, collection_name)

        if len(target_vbuckets) == 0:
            self.log.error("No target vbucket list generated to load data")
            remote.disconnect()
            return

        try:
            self.start_doc_loading_tasks(target_vbuckets, scope_name,
                                         collection)

            # Induce the error condition
            error_sim.create(error_to_simulate)

            self.sleep(20, "Wait before reverting the error condition")
            # Revert the simulated error condition
            error_sim.revert(error_to_simulate)
        finally:
            # Close the ssh session even if the error injection fails
            remote.disconnect()

        # Wait for doc loading task to complete
        self.task.jython_task_manager.get_task_result(self.doc_loading_task)
        if self.atomicity:
            self.task.jython_task_manager.get_task_result(
                self.transaction_load_task)
        elif self.N1qltxn:
            self.task.jython_task_manager.get_task_result(
                self.N1ql_load_task)

        # Failures are reported only when the active node was targeted or
        # when num_replicas is 2 or 3
        if len(self.doc_loading_task.fail.keys()) != 0:
            if self.target_node == "active" or self.num_replicas in [2, 3]:
                self.log_failure("Unwanted failures for keys: %s"
                                 % self.doc_loading_task.fail.keys())

        validate_passed = \
            self.durability_helper.validate_durability_exception(
                self.doc_loading_task.fail,
                SDKException.DurabilityAmbiguousException)
        if not validate_passed:
            self.log_failure("Unwanted exception seen during validation")

        # Get SDK client for CRUD retries
        sdk_client = self.sdk_client_pool.get_client_for_bucket(self.bucket)
        try:
            for doc_key, crud_result in self.doc_loading_task.fail.items():
                result = sdk_client.crud(DocLoading.Bucket.DocOps.CREATE,
                                         doc_key,
                                         crud_result["value"],
                                         replicate_to=self.replicate_to,
                                         persist_to=self.persist_to,
                                         durability=self.durability_level,
                                         timeout=self.sdk_timeout)
                if result["status"] is False:
                    self.log_failure("Retry of doc_key %s failed: %s"
                                     % (doc_key, result["error"]))
        finally:
            # Return the client to the pool even if a retry raises
            self.sdk_client_pool.release_client(sdk_client)

        self.validate_test_failure()

        self.bucket_util._wait_for_stats_all_buckets(self.cluster.buckets)
        # Update self.num_items and validate docs per collection
        if not self.N1qltxn and self.atomicity is False:
            self.bucket_util.validate_docs_per_collections_all_buckets(
                self.cluster)
コード例 #30
0
    def test_sub_doc_with_persistence_issues(self):
        """
        Run sub-doc CRUDs while an error is simulated on the target nodes.

        1. Select nodes from the cluster to simulate the specified error
        2. Record the seqno and failover cbstats of every target node
        3. Induce the error and run the sub-doc mutations described by the
           load spec, with scope/collection CRUDs performed in parallel
        4. Revert the error and validate that the vbucket seqno stats
           changed while the failover stats stayed the same

        The test returns early, without running anything, when the
        configured durability level requires persistence.

        The induced error is always reverted and every opened shell
        connection is always closed, even when a step in between raises
        (e.g. a failed assertion during the stats validation).
        """

        if self.durability_level.upper() in [
                Bucket.DurabilityLevel.MAJORITY_AND_PERSIST_TO_ACTIVE,
                Bucket.DurabilityLevel.PERSIST_TO_MAJORITY
        ]:
            self.log.critical("Test not valid for persistence durability")
            return

        # Per-node helpers, all keyed by node ip
        error_sim = dict()
        shell_conn = dict()
        cbstat_obj = dict()
        # Stats snapshots taken before ("init") and after ("afterCrud")
        # the mutations, each keyed by node ip
        failover_info = {"init": dict(), "afterCrud": dict()}
        vb_info_info = {"init": dict(), "afterCrud": dict()}
        def_bucket = self.bucket_util.buckets[0]

        load_spec = {
            "doc_crud": {
                MetaCrudParams.DocCrud.COMMON_DOC_KEY: "test_collections",
                MetaCrudParams.DocCrud.READ_PERCENTAGE_PER_COLLECTION: 50,
            },
            "subdoc_crud": {
                MetaCrudParams.SubDocCrud.INSERT_PER_COLLECTION: 20,
                MetaCrudParams.SubDocCrud.UPSERT_PER_COLLECTION: 10,
                MetaCrudParams.SubDocCrud.REMOVE_PER_COLLECTION: 10,
            },
        }

        self.log.info("Selecting nodes to simulate error condition")
        target_nodes = DurabilityHelper.getTargetNodes(self.cluster,
                                                       self.nodes_init,
                                                       self.num_nodes_affected)

        # Create new docs for sub-doc operations to run
        self.load_data_for_sub_doc_ops()

        self.log.info("Will simulate error condition on %s" % target_nodes)
        try:
            for node in target_nodes:
                # Create shell_connections and record the initial stats
                shell_conn[node.ip] = RemoteMachineShellConnection(node)
                cbstat_obj[node.ip] = Cbstats(shell_conn[node.ip])
                vb_info_info["init"][node.ip] = \
                    cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
                failover_info["init"][node.ip] = \
                    cbstat_obj[node.ip].failover_stats(def_bucket.name)

            try:
                for node in target_nodes:
                    # Perform specified action. The helper is registered
                    # before create() so that a partially induced error
                    # still gets reverted by the finally block below.
                    error_sim[node.ip] = CouchbaseError(self.log,
                                                        shell_conn[node.ip])
                    error_sim[node.ip].create(self.simulate_error,
                                              bucket_name=def_bucket.name)

                # Perform CRUDs while the induced error scenario is active
                self.log.info("Perform 'insert', 'upsert', 'remove' mutations")
                doc_loading_task = \
                    self.bucket_util.run_scenario_from_spec(
                        self.task,
                        self.cluster,
                        self.bucket_util.buckets,
                        load_spec,
                        mutation_num=0,
                        async_load=True)

                # Perform new scope/collection creation during doc ops
                # in parallel
                self.__perform_collection_crud(mutation_num=1)

                # Wait for doc_loading to complete and validate the doc ops
                self.task_manager.get_task_result(doc_loading_task)
                if doc_loading_task.result is False:
                    self.log_failure("Doc CRUDs failed with persistence issue")
            finally:
                # Revert the induced error condition, even on failure, so
                # the nodes are not left in the simulated error state
                for node in target_nodes:
                    if node.ip in error_sim:
                        error_sim[node.ip].revert(self.simulate_error,
                                                  bucket_name=def_bucket.name)

            # Fetch latest failover stats and validate the values
            self.log.info("Validating failover and seqno cbstats")
            for node in target_nodes:
                vb_info_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].vbucket_seqno(def_bucket.name)
                failover_info["afterCrud"][node.ip] = \
                    cbstat_obj[node.ip].failover_stats(def_bucket.name)

                # Failover validation: the stats are required to be
                # identical before and after the CRUDs
                failover_unchanged = \
                    failover_info["init"][node.ip] \
                    == failover_info["afterCrud"][node.ip]
                self.assertTrue(
                    failover_unchanged,
                    msg="Failover stats changed unexpectedly after CRUDs")

                # Seq_no validation (High level)
                seqno_changed = \
                    vb_info_info["init"][node.ip] \
                    != vb_info_info["afterCrud"][node.ip]
                self.assertTrue(seqno_changed,
                                msg="vbucket seq_no not updated after CRUDs")
        finally:
            # Disconnect every shell connection that was opened
            for node in target_nodes:
                if node.ip in shell_conn:
                    shell_conn[node.ip].disconnect()

        self.validate_test_failure()
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()