Example #1
0
    def test_node_crash_cluster(self):
        self.setup_xdcr_and_load()

        crashed_nodes = []
        crash = self._input.param("crash", "").split('-')
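        # the "crash" test param names the clusters whose nodes get killed, e.g. "C1-C2" for both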
        if "C1" in crash:
            crashed_nodes += self.src_cluster.get_nodes()
            self.__kill_processes(crashed_nodes)
            self.sleep(30)
        if "C2" in crash:
            crashed_nodes += self.dest_cluster.get_nodes()
            self.__kill_processes(crashed_nodes)

        for crashed_node in crashed_nodes:
            self.__start_cb_server(crashed_node)

        bucket_type = self._input.param("bucket_type", "membase")

        if "C1" in crash:
            if bucket_type == "ephemeral":
                self.sleep(self._wait_timeout)
            else:
                NodeHelper.wait_warmup_completed(self.src_cluster.get_nodes())
            gen_create = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self._num_items)
            self.src_cluster.load_all_buckets_from_generator(kv_gen=gen_create)

        self.async_perform_update_delete()

        if "C2" in crash:
            if bucket_type == "ephemeral":
                self.sleep(self._wait_timeout)
            else:
                NodeHelper.wait_warmup_completed(self.dest_cluster.get_nodes())

        self.verify_results()
Example #2
0
    def test_capi_with_malformed_http_resp(self):
        repl_id = self._start_es_replication(xdcr_params={'workerBatchSize':'2000',
                                                          'docBatchSizeKb':'8096',
                                                          'targetNozzlePerNode':'64'})

        rest_conn = RestConnection(self.src_master)

        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'true')

        gen = DocumentGenerator('es', '{{"key":"value","mutated":0}}', xrange(100), start=0, end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)

        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'false')

        self._wait_for_es_replication_to_catchup()

        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self.src_master)\
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                            node,
                            "malformed HTTP response",
                            goxdcr_log)
            self.assertEqual(count, 0, "malformed HTTP response error message found in " + str(node.ip))
            self.log.info("malformed HTTP response error message not found in " + str(node.ip))

        self._verify_es_results()
Example #3
0
    def test_checkpointing_with_full_rollback(self):
        bucket = self.src_cluster.get_buckets()[0]
        nodes = self.src_cluster.get_nodes()

        # Stop Persistence on Node A & Node B
        for node in nodes:
            mem_client = MemcachedClientHelper.direct_client(node, bucket)
            mem_client.stop_persistence()

        self.src_cluster.pause_all_replications()

        gen = BlobGenerator("C1-", "C1-", self._value_size, end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)

        self.src_cluster.resume_all_replications()

        self.sleep(self._checkpoint_interval * 2)

        self.get_and_validate_latest_checkpoint()

        # Perform mutations on the bucket
        self.async_perform_update_delete()

        self.sleep(self._wait_timeout)

        # Kill memcached on Node A so that Node B becomes master
        shell = RemoteMachineShellConnection(self.src_cluster.get_master_node())
        shell.kill_memcached()

        # Start persistence on Node B
        mem_client = MemcachedClientHelper.direct_client(nodes[1], bucket)
        mem_client.start_persistence()

        # Failover Node B
        failover_task = self.src_cluster.async_failover()
        failover_task.result()

        # Wait for Failover & rollback to complete
        self.sleep(self._wait_timeout * 5)

        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        count1 = NodeHelper.check_goxdcr_log(
            nodes[0],
            "Received rollback from DCP stream",
            goxdcr_log,
            timeout=30)
        self.assertGreater(count1, 0, "full rollback not received from DCP as expected")
        self.log.info("full rollback received from DCP as expected")
        count2 = NodeHelper.check_goxdcr_log(
            nodes[0],
            "Rolled back startSeqno to 0",
            goxdcr_log,
            timeout=30)
        self.assertGreater(count2, 0, "startSeqno not rolled back to 0 as expected")
        self.log.info("startSeqno rolled back to 0 as expected")

        shell.disconnect()
Example #4
0
    def replication_while_rebooting_a_non_master_destination_node(self):
        self.setup_xdcr_and_load()
        self.src_cluster.set_xdcr_param("xdcrFailureRestartInterval", 1)
        self.perform_update_delete()
        self.sleep(self._wait_timeout / 2)
        rebooted_node = self.dest_cluster.reboot_one_node(self)
        NodeHelper.wait_node_restarted(rebooted_node, self, wait_time=self._wait_timeout * 4, wait_if_warmup=True)

        self.verify_results()
Example #5
0
    def replication_with_firewall_enabled(self):
        self.src_cluster.set_xdcr_param("xdcrFailureRestartInterval", 1)
        self.setup_xdcr_and_load()
        self.perform_update_delete()

        NodeHelper.enable_firewall(self.dest_master)
        self.sleep(30)
        NodeHelper.disable_firewall(self.dest_master)
        self.verify_results()
Example #6
0
 def is_ssl_over_memcached(self, master):
     if not NodeHelper.check_goxdcr_log(master,
                 "Try to create a ssl over memcached connection"):
         if NodeHelper.check_goxdcr_log(master,
                 "Get or create ssl over proxy connection"):
             self.log.error("SSL still uses ns_proxy connection!")
         return False
     self.log.info("SSL uses memcached after upgrade!")
     return True
Example #7
0
 def is_ssl_over_memcached(self, master):
     if not NodeHelper.check_goxdcr_log(master,
                 "Trying to create a ssl over memcached connection"):
         if NodeHelper.check_goxdcr_log(master,
                 "Get or create ssl over proxy connection"):
             self.log.error("SSL still uses ns_proxy connection!")
         return False
     self.log.info("SSL uses memcached after upgrade!")
     return True
Example #8
0
    def replication_while_rebooting_a_non_master_destination_node(self):
        self.setup_xdcr_and_load()
        self.src_cluster.set_xdcr_param("xdcrFailureRestartInterval", 1)
        self.perform_update_delete()
        self.sleep(self._wait_timeout / 2)
        rebooted_node = self.dest_cluster.reboot_one_node(self)
        NodeHelper.wait_node_restarted(rebooted_node, self, wait_time=self._wait_timeout * 4, wait_if_warmup=True)

        self.verify_results()
Example #9
0
    def replication_with_firewall_enabled(self):
        self.src_cluster.set_xdcr_param("xdcrFailureRestartInterval", 1)
        self.setup_xdcr_and_load()
        self.perform_update_delete()

        NodeHelper.enable_firewall(self.dest_master)
        self.sleep(30)
        NodeHelper.disable_firewall(self.dest_master)
        self.verify_results()
Example #10
0
    def test_checkpointing_with_full_rollback(self):
        bucket = self.src_cluster.get_buckets()[0]
        nodes = self.src_cluster.get_nodes()

        # Stop Persistence on Node A & Node B
        for node in nodes:
            mem_client = MemcachedClientHelper.direct_client(node, bucket)
            mem_client.stop_persistence()

        self.src_cluster.pause_all_replications()

        gen = BlobGenerator("C1-", "C1-", self._value_size, end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)

        self.src_cluster.resume_all_replications()

        self.sleep(self._checkpoint_interval * 2)

        self.get_and_validate_latest_checkpoint()

        # Perform mutations on the bucket
        self.async_perform_update_delete()

        self.sleep(self._wait_timeout)

        # Kill memcached on Node A so that Node B becomes master
        shell = RemoteMachineShellConnection(self.src_cluster.get_master_node())
        shell.kill_memcached()

        # Start persistence on Node B
        mem_client = MemcachedClientHelper.direct_client(nodes[1], bucket)
        mem_client.start_persistence()

        # Failover Node B
        failover_task = self.src_cluster.async_failover()
        failover_task.result()

        # Wait for Failover & rollback to complete
        self.sleep(self._wait_timeout * 5)

        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'
        count1 = NodeHelper.check_goxdcr_log(
            nodes[0],
            "Received rollback from DCP stream",
            goxdcr_log)
        self.assertGreater(count1, 0, "full rollback not received from DCP as expected")
        self.log.info("full rollback received from DCP as expected")
        count2 = NodeHelper.check_goxdcr_log(
            nodes[0],
            "Rolled back startSeqno to 0",
            goxdcr_log)
        self.assertGreater(count2, 0, "startSeqno not rolled back to 0 as expected")
        self.log.info("startSeqno rolled back to 0 as expected")

        shell.disconnect()
Example #11
0
    def incremental_offline_upgrade(self):
        upgrade_seq = self.input.param("upgrade_seq", "src>dest")
        self._install(self.servers[:self.src_init + self.dest_init ])
        self.create_buckets()
        self._join_all_clusters()
        self.sleep(60)
        bucket = self.src_cluster.get_bucket_by_name('default')
        self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
        bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
        self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
        bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
        gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)
        self.sleep(self.wait_timeout)
        self._wait_for_replication_to_catchup()
        nodes_to_upgrade = []
        if upgrade_seq == "src>dest":
            nodes_to_upgrade = copy.copy(self.src_nodes)
            nodes_to_upgrade.extend(self.dest_nodes)
        elif upgrade_seq == "src<dest":
            nodes_to_upgrade = copy.copy(self.dest_nodes)
            nodes_to_upgrade.extend(self.src_nodes)
        elif upgrade_seq == "src><dest":
            min_cluster = min(len(self.src_nodes), len(self.dest_nodes))
            for i in xrange(min_cluster):
                nodes_to_upgrade.append(self.src_nodes[i])
                nodes_to_upgrade.append(self.dest_nodes[i])

        for _seq, node in enumerate(nodes_to_upgrade):
            self._offline_upgrade([node])
            self.sleep(60)
            bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
            itemPrefix = "loadThree" + _seq * 'a'
            gen_create3 = BlobGenerator(itemPrefix, itemPrefix, self._value_size, end=self.num_items)
            self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
            bucket = self.src_cluster.get_bucket_by_name('default')
            itemPrefix = "loadFour" + _seq * 'a'
            gen_create4 = BlobGenerator(itemPrefix, itemPrefix, self._value_size, end=self.num_items)
            self._load_bucket(bucket, self.src_master, gen_create4, 'create', exp=0)
            self._wait_for_replication_to_catchup()
        self.merge_all_buckets()
        self.verify_results()
        self.sleep(self.wait_timeout * 5, "Let clusters work for some time")
        if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
            goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'
            for node in self.src_cluster.get_nodes():
                count = NodeHelper.check_goxdcr_log(
                            node,
                            "Failed to repair connections to target cluster",
                            goxdcr_log)
                self.assertEqual(count, 0, "Failed to repair connections to target cluster "
                                        "error message found in " + str(node.ip))
                self.log.info("Failed to repair connections to target cluster "
                                        "error message not found in " + str(node.ip))
Example #12
0
    def test_retry_connections_on_errors_before_restart(self):
        """
        CBQE-3373: Do not restart pipeline as soon as connection errors are
        detected, backoff and retry 5 times before trying to restart pipeline.
        """
        passed = False
        # start data load after setting up xdcr
        load_tasks = self.setup_xdcr_async_load()
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'

        # block port 11210 on target to simulate a connection error
        shell = RemoteMachineShellConnection(self.dest_master)
        out, err = shell.execute_command(
            "/sbin/iptables -A INPUT -p tcp --dport"
            " 11210 -j DROP")
        shell.log_command_output(out, err)
        out, err = shell.execute_command("/sbin/iptables -L")
        shell.log_command_output(out, err)

        # complete loading
        for task in load_tasks:
            task.result()

        # wait for goxdcr to detect i/o timeout and try repairing
        self.sleep(self._wait_timeout * 5)

        # unblock port 11210 so replication can continue
        out, err = shell.execute_command(
            "/sbin/iptables -D INPUT -p tcp --dport"
            " 11210 -j DROP")
        shell.log_command_output(out, err)
        out, err = shell.execute_command("/sbin/iptables -L")
        shell.log_command_output(out, err)
        shell.disconnect()

        # check logs for traces of retry attempts
        for node in self.src_cluster.get_nodes():
            count1 = NodeHelper.check_goxdcr_log(
                node, "Failed to repair connections to target cluster",
                goxdcr_log)
            count2 = NodeHelper.check_goxdcr_log(
                node, "Failed to set up connections to target cluster",
                goxdcr_log)
            count = count1 + count2
            if count > 0:
                self.log.info('SUCCESS: We tried to repair connections before'
                              ' restarting pipeline')
                passed = True

        if not passed:
            self.fail(
                "No attempts were made to repair connections on %s before"
                " restarting pipeline" % self.src_cluster.get_nodes())
        self.verify_results()
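One note on the example above: the iptables DROP rule added on the destination is only removed if every intermediate step succeeds, so a failure while loading or sleeping leaves port 11210 blocked. Below is a minimal sketch of a safer wrapper, assuming only the RemoteMachineShellConnection API already used in these tests; the helper name is illustrative, not part of the framework.

    from contextlib import contextmanager

    @contextmanager
    def port_blocked(shell, port=11210):
        # temporarily DROP inbound TCP traffic to `port`, restoring the rule on exit
        block = "/sbin/iptables -A INPUT -p tcp --dport %s -j DROP" % port
        unblock = "/sbin/iptables -D INPUT -p tcp --dport %s -j DROP" % port
        out, err = shell.execute_command(block)
        shell.log_command_output(out, err)
        try:
            yield shell
        finally:
            # always remove the rule, even if the wrapped steps failed
            out, err = shell.execute_command(unblock)
            shell.log_command_output(out, err)

Usage would be: with port_blocked(shell): ... wrapped around the load-and-wait steps, so the unblock no longer depends on the test reaching the second iptables call.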
Example #13
0
    def test_retry_connections_on_errors_before_restart(self):
        """
        CBQE-3373: Do not restart pipeline as soon as connection errors are
        detected, backoff and retry 5 times before trying to restart pipeline.
        """
        passed = False
        # start data load after setting up xdcr
        load_tasks = self.setup_xdcr_async_load()
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'

        # block port 11210 on target to simulate a connection error
        shell = RemoteMachineShellConnection(self.dest_master)
        out, err = shell.execute_command("/sbin/iptables -A INPUT -p tcp --dport"
                                         " 11210 -j DROP")
        shell.log_command_output(out, err)
        out, err = shell.execute_command("/sbin/iptables -L")
        shell.log_command_output(out, err)

        # complete loading
        for task in load_tasks:
            task.result()

        # wait for goxdcr to detect i/o timeout and try repairing
        self.sleep(self._wait_timeout*5)

        # unblock port 11210 so replication can continue
        out, err = shell.execute_command("/sbin/iptables -D INPUT -p tcp --dport"
                                         " 11210 -j DROP")
        shell.log_command_output(out, err)
        out, err = shell.execute_command("/sbin/iptables -L")
        shell.log_command_output(out, err)
        shell.disconnect()

        # check logs for traces of retry attempts
        for node in self.src_cluster.get_nodes():
            count1 = NodeHelper.check_goxdcr_log(
                            node,
                            "Failed to repair connections to target cluster",
                            goxdcr_log)
            count2 = NodeHelper.check_goxdcr_log(
                            node,
                            "Failed to set up connections to target cluster",
                            goxdcr_log)
            count = count1 + count2
            if count > 0:
                self.log.info('SUCCESS: We tried to repair connections before'
                              ' restarting pipeline')
                passed = True

        if not passed:
            self.fail("No attempts were made to repair connections on %s before"
                      " restarting pipeline" % self.src_cluster.get_nodes())
        self.verify_results()
Example #14
0
 def _verify_bandwidth_usage(self,
                             node,
                             nw_limit,
                             no_of_nodes,
                             event_time=None,
                             nw_usage="[0-9][0-9]*",
                             end_time=None):
     #nw_max = (nw_limit * 1024 * 1024) / no_of_nodes
     if event_time:
         time_to_compare = self._extract_timestamp(event_time)
     else:
         matches, count = NodeHelper.check_goxdcr_log(
             node,
             "Success adding replication specification",
             print_matches=True,
             timeout=60)
         # Time when replication was set up
         if count > 0:
             time_to_compare = self._extract_timestamp(matches[-1])
         else:
             self.fail("Replication not successful")
     nw_max = self._extract_bandwith_quota(node)
     self.sleep(60, 'Waiting for bandwidth usage logs..')
     # Try 3 times to extract current bandwidth usage from logs
      attempts = 0
      while attempts < 3:
          self.sleep(30, 'Waiting for bandwidth usage logs..')
          valid_count = self._extract_bandwidth_usage(
              node, time_to_compare, nw_max, nw_usage, end_time)
          if (valid_count == 0 and self._input.param(
                  "replication_type") == "capi") or nw_limit == 0:
              self.log.info(
                  "Bandwidth Throttler not enabled on replication as expected"
              )
              break
          if valid_count > 0:
              break
          attempts += 1
     else:
         self.fail("Bandwidth Throttler not enabled!")
     # Check if large docs are not getting stuck
     matches, src_count = NodeHelper.check_goxdcr_log(
         self.src_master,
         "The connection is ruined",
         print_matches=True,
         timeout=10)
     if src_count:
         for item in matches:
             item_datetime = self._extract_timestamp(item)
             # Ignore errors that happened before the replication was set up
             if item_datetime < time_to_compare:
                 continue
             else:
                 self.fail("Possibly hit MB-31765")
Example #15
0
    def test_verify_mb19697(self):
        self.setup_xdcr_and_load()
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'

        self.src_cluster.pause_all_replications()

        gen = BlobGenerator("C1-", "C1-", self._value_size, end=100000)
        self.src_cluster.load_all_buckets_from_generator(gen)

        self.src_cluster.resume_all_replications()
        self._wait_for_replication_to_catchup()

        gen = BlobGenerator("C1-", "C1-", self._value_size, end=100000)
        load_tasks = self.src_cluster.async_load_all_buckets_from_generator(
            gen)

        self.src_cluster.rebalance_out()

        for task in load_tasks:
            task.result()

        self._wait_for_replication_to_catchup()

        self.src_cluster.rebalance_in()

        gen = BlobGenerator("C1-", "C1-", self._value_size, end=100000)
        load_tasks = self.src_cluster.async_load_all_buckets_from_generator(
            gen)

        self.src_cluster.failover_and_rebalance_master()

        for task in load_tasks:
            task.result()

        self._wait_for_replication_to_catchup()

        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                node,
                "counter .+ goes backward, maybe due to the pipeline is restarted",
                goxdcr_log)
            self.assertEqual(
                count, 0,
                "counter goes backward, maybe due to the pipeline is restarted "
                "error message found in " + str(node.ip))
            self.log.info(
                "counter goes backward, maybe due to the pipeline is restarted "
                "error message not found in " + str(node.ip))

        self.sleep(300)
        self.verify_results()
Example #16
0
    def test_verify_mb20463(self):
        src_version = NodeHelper.get_cb_version(
            self.src_cluster.get_master_node())
        if float(src_version[:3]) != 4.5:
            self.log.info("Source cluster has to be at 4.5 for this test")
            return

        servs = self._input.servers[2:4]
        params = {}
        params['num_nodes'] = len(servs)
        params['product'] = 'cb'
        params['version'] = '4.1.2-6088'
        params['vbuckets'] = [1024]
        self.log.info("will install {0} on {1}".format('4.1.2-6088',
                                                       [s.ip for s in servs]))
        InstallerJob().parallel_install(servs, params)

        if params['product'] in ["couchbase", "couchbase-server", "cb"]:
            success = True
            for server in servs:
                success &= RemoteMachineShellConnection(
                    server).is_couchbase_installed()
                if not success:
                    self.fail(
                        "some nodes were not installed successfully on target cluster!"
                    )

        self.log.info("4.1.2 installed successfully on target cluster")

        conn = RestConnection(self.dest_cluster.get_master_node())
        conn.add_node(user=self._input.servers[3].rest_username,
                      password=self._input.servers[3].rest_password,
                      remoteIp=self._input.servers[3].ip)
        self.sleep(30)
        conn.rebalance(otpNodes=[node.id for node in conn.node_statuses()])
        self.sleep(30)
        conn.create_bucket(bucket='default', ramQuotaMB=512)

        tasks = self.setup_xdcr_async_load()

        self.sleep(30)

        NodeHelper.enable_firewall(self.dest_master)
        self.sleep(30)
        NodeHelper.disable_firewall(self.dest_master)

        for task in tasks:
            task.result()

        self._wait_for_replication_to_catchup(timeout=600)

        self.verify_results()
Example #17
0
 def is_goxdcr_migration_successful(self, server):
     count = NodeHelper.check_goxdcr_log(server,
                             "Starting to migrate xdcr metadata")
     if count > 0:
         count = NodeHelper.check_goxdcr_log(server,
                             "Metadata migration completed without errors")
         self.log.info(count)
         if count == 1:
             self.log.info("SUCCESS: Metadata migration completed without errors")
             return True
         self.log.error("ERROR: Metadata migration was unsuccessful")
         return False
     return True
Example #18
0
    def load_with_async_ops_with_warmup_master(self):
        self.setup_xdcr_and_load()
        warmupnodes = []
        if "C1" in self._warmup:
            warmupnodes.append(self.src_cluster.warmup_node(master=True))
        if "C2" in self._warmup:
            warmupnodes.append(self.dest_cluster.warmup_node(master=True))

        self.sleep(self._wait_timeout)
        NodeHelper.wait_warmup_completed(warmupnodes)
        self.async_perform_update_delete()
        self.sleep(self._wait_timeout / 2)
        self.verify_results()
Example #19
0
    def load_with_async_ops_with_warmup(self):
        self.setup_xdcr_and_load()
        warmupnodes = []
        if "C1" in self._warmup:
            warmupnodes.append(self.src_cluster.warmup_node())
        if "C2" in self._warmup:
            warmupnodes.append(self.dest_cluster.warmup_node())

        self.sleep(self._wait_timeout)
        NodeHelper.wait_warmup_completed(warmupnodes)
        self.async_perform_update_delete()
        self.sleep(self._wait_timeout / 2)
        self.verify_results()
Example #20
0
 def is_goxdcr_migration_successful(self, server):
     count = NodeHelper.check_goxdcr_log(server,
                             "Starting to migrate xdcr metadata")
     if count > 0:
         count = NodeHelper.check_goxdcr_log(server,
                             "Metadata migration completed without errors")
         self.log.info(count)
         if count == 1:
             self.log.info("SUCCESS: Metadata migration completed without errors")
             return True
         self.log.error("ERROR: Metadata migration was unsuccessful")
         return False
     return True
Example #21
0
    def replication_while_rebooting_a_non_master_src_dest_node(self):
        self.setup_xdcr_and_load()
        self.async_perform_update_delete()
        self.sleep(self._wait_timeout)

        reboot_node_dest = self.dest_cluster.reboot_one_node(self)
        NodeHelper.wait_node_restarted(reboot_node_dest, self, wait_time=self._wait_timeout * 4, wait_if_warmup=True)

        reboot_node_src = self.src_cluster.reboot_one_node(self)
        NodeHelper.wait_node_restarted(reboot_node_src, self, wait_time=self._wait_timeout * 4, wait_if_warmup=True)

        self.sleep(120)
        ClusterOperationHelper.wait_for_ns_servers_or_assert([reboot_node_dest], self, wait_if_warmup=True)
        ClusterOperationHelper.wait_for_ns_servers_or_assert([reboot_node_src], self, wait_if_warmup=True)
        self.verify_results()
Example #22
0
    def replication_while_rebooting_a_non_master_src_dest_node(self):
        self.setup_xdcr_and_load()
        self.async_perform_update_delete()
        self.sleep(self._wait_timeout)

        reboot_node_dest = self.dest_cluster.reboot_one_node(self)
        NodeHelper.wait_node_restarted(reboot_node_dest, self, wait_time=self._wait_timeout * 4, wait_if_warmup=True)

        reboot_node_src = self.src_cluster.reboot_one_node(self)
        NodeHelper.wait_node_restarted(reboot_node_src, self, wait_time=self._wait_timeout * 4, wait_if_warmup=True)

        self.sleep(120)
        ClusterOperationHelper.wait_for_ns_servers_or_assert([reboot_node_dest], self, wait_if_warmup=True)
        ClusterOperationHelper.wait_for_ns_servers_or_assert([reboot_node_src], self, wait_if_warmup=True)
        self.verify_results()
Example #23
0
    def load_with_async_ops_and_joint_sets_with_warmup_master(self):
        self.setup_xdcr_and_load()
        warmupnodes = []
        if "C1" in self._warmup:
            warmupnodes.append(self.src_cluster.warmup_node(master=True))
        if "C2" in self._warmup:
            warmupnodes.append(self.dest_cluster.warmup_node(master=True))

        self.sleep(self._wait_timeout)
        self.__perform_ops_joint_sets()
        self.sleep(self._wait_timeout / 2)

        NodeHelper.wait_warmup_completed(warmupnodes)

        self.verify_results()
Example #24
0
    def mutate_and_checkpoint(self, n=3, skip_validation=False):
        count = 1
        # get vb0 active source node
        stats_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/stats.log'
        active_src_node = self.get_active_vb0_node(self.src_master)
        while count <= n:
            remote_vbuuid, remote_highseqno = self.get_failover_log(
                self.dest_master)
            local_vbuuid, local_highseqno = self.get_failover_log(
                self.src_master)

            self.log.info("Local failover log: [{0}, {1}]".format(
                local_vbuuid, local_highseqno))
            self.log.info("Remote failover log: [{0}, {1}]".format(
                remote_vbuuid, remote_highseqno))
            self.log.info(
                "################ New mutation:{0} ##################".format(
                    self.key_counter + 1))
            self.load_one_mutation_into_source_vb0(active_src_node)
            self.sleep(60)
            if local_highseqno == "0":
                # avoid checking very first/empty checkpoint record
                count += 1
                continue
            end_time = time.time() + self._wait_timeout
            while time.time() < end_time:
                stats_count = NodeHelper.check_goxdcr_log(
                    active_src_node, "docs_checked,{0}".format(count),
                    stats_log)
                if stats_count > 0:
                    self.log.info("Checkpoint recorded as expected")
                    if not skip_validation:
                        self.log.info("Validating latest checkpoint")
                        self.get_and_validate_latest_checkpoint()
                    break
                else:
                    self.sleep(
                        20,
                        "Checkpoint not recorded yet, will check after 20s")
            else:
                self.log.info(
                    "Checkpointing failed - may not be an error if vb_uuid changed "
                )
                return False
            count += 1

        return True
Example #25
0
    def mutate_and_checkpoint(self, n=3, skip_validation=False):
        count = 1
        # get vb0 active source node
        active_src_node = self.get_active_vb0_node(self.src_master)
        while count <= n:
            remote_vbuuid, remote_highseqno = self.get_failover_log(self.dest_master)
            local_vbuuid, local_highseqno = self.get_failover_log(self.src_master)

            self.log.info("Local failover log: [{0}, {1}]".format(local_vbuuid,local_highseqno))
            self.log.info("Remote failover log: [{0}, {1}]".format(remote_vbuuid,remote_highseqno))
            self.log.info("################ New mutation:{0} ##################".format(self.key_counter+1))
            self.load_one_mutation_into_source_vb0(active_src_node)
            self.sleep(60)
            if local_highseqno == "0":
                # avoid checking very first/empty checkpoint record
                count += 1
                continue
            stats_count = NodeHelper.check_goxdcr_log(
                        active_src_node,
                        "docs_checked,{0}".format(count),
                        log_name="stats.log",
                        timeout=30)
            if stats_count > 0:
                self.log.info("Checkpoint recorded as expected")
                if not skip_validation:
                    self.log.info("Validating latest checkpoint")
                    self.get_and_validate_latest_checkpoint()
            else:
                self.log.info("Checkpointing failed - may not be an error if vb_uuid changed ")
                return False
            count += 1
        return True
Example #26
0
 def _extract_bandwidth_usage(self, node, time_to_compare, nw_max, nw_usage, end_time):
     valid_count = 0
     skip_count = 0
     matches, count = NodeHelper.check_goxdcr_log(node, "\\\"bandwidth_usage\\\": " + nw_usage,
                                                  print_matches=True, timeout=60)
     for item in matches:
         item_datetime = self._extract_timestamp(item)
         # Ignore entries that happened before the replication was set up
         if item_datetime < time_to_compare:
             skip_count += 1
             continue
         if end_time:
             end_datetime = self._extract_timestamp(end_time)
             if item_datetime > end_datetime:
                 skip_count += 1
                 continue
         bandwidth_usage = int(float(((item.split('{"bandwidth_usage": ')[1]).split(' ')[0]).rstrip(',')))
         if bandwidth_usage > nw_max:
             self.fail(
                 "Bandwidth usage {0} is higher than Bandwidth limit {1} in {2}".format(bandwidth_usage, nw_max,
                                                                                        item))
         self.log.info("BANDWIDTH_USAGE ={0}".format(bandwidth_usage))
         if nw_usage == "0" and bandwidth_usage != 0:
             self.fail(
                 "Expecting bandwidth usage to be 0 but it is {0}".format(bandwidth_usage))
         valid_count += 1
     self.log.info("Stale entries :{0}, Valid entries :{1}".format(skip_count, valid_count))
     return valid_count
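For reference, the split/rstrip chain above turns a stats entry carrying a bandwidth_usage field into an integer byte count. A minimal illustration with a hypothetical log line (the timestamp and surrounding fields are invented; only the shape matters):

    # hypothetical stats line, shaped only to exercise the parsing expression above
    item = '2019-01-01T00:00:00.000+00:00 ... {"bandwidth_usage": 524288.0, "docs_written": 10}'
    usage = int(float(((item.split('{"bandwidth_usage": ')[1]).split(' ')[0]).rstrip(',')))
    assert usage == 524288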
Example #27
0
    def test_capi_with_advanced_settings(self):
        batch_count = self._input.param("batch_count", 10)
        batch_size = self._input.param("batch_size", 2048)
        source_nozzle = self._input.param("source_nozzle", 2)
        target_nozzle = self._input.param("target_nozzle", 2)
        enable_firewall = self._input.param("enable_firewall", False)

        capi_data_chan_size_multi = self._input.param(
            "capi_data_chan_size_multi", None)
        if capi_data_chan_size_multi:
            shell = RemoteMachineShellConnection(self.src_master)
            command = "curl -X POST -u Administrator:password http://127.0.0.1:9998/xdcr/internalSettings " + \
                      "-d CapiDataChanSizeMultiplier=" + str(capi_data_chan_size_multi)
            output, error = shell.execute_command(command)
            shell.log_command_output(output, error)

        repl_id = self._start_es_replication(
            xdcr_params={
                'workerBatchSize': str(batch_count),
                'docBatchSizeKb': str(batch_size),
                'sourceNozzlePerNode': str(source_nozzle),
                'targetNozzlePerNode': str(target_nozzle)
            })

        rest_conn = RestConnection(self.src_master)

        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED,
                                          'true')

        gen = DocumentGenerator('es',
                                '{{"key":"value","mutated":0}}',
                                xrange(100),
                                start=0,
                                end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)

        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED,
                                          'false')

        if enable_firewall:
            NodeHelper.enable_firewall(self.dest_cluster.get_master_node())
            self.sleep(120)
            NodeHelper.disable_firewall(self.dest_cluster.get_master_node())

        self._wait_for_es_replication_to_catchup()

        self._verify_es_results()
Example #28
0
    def _verify_bandwidth_usage(self, node, nw_limit=1, no_of_nodes=2, event_time=None,
                                nw_usage="[1-9][0-9]*", end_time=None):
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(node) + '/goxdcr.log'
        nw_max = (nw_limit * 1024 * 1024)/no_of_nodes

        if event_time:
            time_to_compare = datetime.datetime.strptime(event_time.group(), '%Y-%m-%dT%H:%M:%S')
        else:
            matches, _ = NodeHelper.check_goxdcr_log(node, "Success adding replication specification",
                                                 goxdcr_log, print_matches=True)
            # Time when replication was set up
            time_to_compare = self._extract_timestamp(matches[-1])

        matches, count = NodeHelper.check_goxdcr_log(node, "\\\"bandwidth_usage\\\": " + nw_usage, goxdcr_log, print_matches=True)
        if count < 1:
            self.fail("Bandwidth usage information not found in logs")

        match_count = 0
        skip_count = 0
        for item in matches:
            item_datetime = self._extract_timestamp(item)
            #Ignore entries that happened before the replication was set up
            if item_datetime < time_to_compare:
                skip_count += 1
                continue
            if end_time:
                end_datetime = datetime.datetime.strptime(end_time.group(), '%Y-%m-%dT%H:%M:%S')
                if item_datetime > end_datetime:
                    skip_count += 1
                    continue
            bandwidth_usage = ((item.split('{"bandwidth_usage": ')[1]).split(' ')[0]).rstrip(',')
            if int(float(bandwidth_usage)) < nw_max:
                match_count += 1
                continue
            else:
                self.fail("Bandwidth usage {0} is higher than Bandwidth limit {1} in {2}".format(bandwidth_usage,nw_max,item))

        if match_count + skip_count == count:
            self.log.info("{0} stale entries skipped".format(skip_count))
            if match_count > 0:
                self.log.info("{0} entries checked - Bandwidth usage always lower than Bandwidth limit as expected".
                          format(match_count))
            else:
                if self._input.param("replication_type") == "capi":
                    self.log.info("Bandwidth Throttler not enabled on replication as expected")
                else:
                    self.fail("Bandwidth Throttler not enabled on replication")
Example #29
0
    def test_verify_mb19697(self):
        self.setup_xdcr_and_load()
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'

        self.src_cluster.pause_all_replications()

        gen = BlobGenerator("C1-", "C1-", self._value_size, end=100000)
        self.src_cluster.load_all_buckets_from_generator(gen)

        self.src_cluster.resume_all_replications()
        self._wait_for_replication_to_catchup()

        gen = BlobGenerator("C1-", "C1-", self._value_size, end=100000)
        load_tasks = self.src_cluster.async_load_all_buckets_from_generator(gen)

        self.src_cluster.rebalance_out()

        for task in load_tasks:
            task.result()

        self._wait_for_replication_to_catchup()

        self.src_cluster.rebalance_in()

        gen = BlobGenerator("C1-", "C1-", self._value_size, end=100000)
        load_tasks = self.src_cluster.async_load_all_buckets_from_generator(gen)

        self.src_cluster.failover_and_rebalance_master()

        for task in load_tasks:
            task.result()

        self._wait_for_replication_to_catchup()

        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                            node,
                            "counter .+ goes backward, maybe due to the pipeline is restarted",
                            goxdcr_log)
            self.assertEqual(count, 0, "counter goes backward, maybe due to the pipeline is restarted "
                                        "error message found in " + str(node.ip))
            self.log.info("counter goes backward, maybe due to the pipeline is restarted "
                                        "error message not found in " + str(node.ip))

        self.sleep(300)
        self.verify_results()
Example #30
0
    def test_verify_mb20463(self):
        src_version = NodeHelper.get_cb_version(self.src_cluster.get_master_node())
        if float(src_version[:3]) != 4.5:
            self.log.info("Source cluster has to be at 4.5 for this test")
            return

        servs = self._input.servers[2:4]
        params = {}
        params['num_nodes'] = len(servs)
        params['product'] = 'cb'
        params['version'] = '4.1.2-6088'
        params['vbuckets'] = [1024]
        self.log.info("will install {0} on {1}".format('4.1.2-6088', [s.ip for s in servs]))
        InstallerJob().parallel_install(servs, params)

        if params['product'] in ["couchbase", "couchbase-server", "cb"]:
            success = True
            for server in servs:
                success &= RemoteMachineShellConnection(server).is_couchbase_installed()
                if not success:
                    self.fail("some nodes were not installed successfully on target cluster!")

        self.log.info("4.1.2 installed successfully on target cluster")

        conn = RestConnection(self.dest_cluster.get_master_node())
        conn.add_node(user=self._input.servers[3].rest_username, password=self._input.servers[3].rest_password,
                      remoteIp=self._input.servers[3].ip)
        self.sleep(30)
        conn.rebalance(otpNodes=[node.id for node in conn.node_statuses()])
        self.sleep(30)
        conn.create_bucket(bucket='default', ramQuotaMB=512)

        tasks = self.setup_xdcr_async_load()

        self.sleep(30)

        NodeHelper.enable_firewall(self.dest_master)
        self.sleep(30)
        NodeHelper.disable_firewall(self.dest_master)

        for task in tasks:
            task.result()

        self._wait_for_replication_to_catchup(timeout=600)

        self.verify_results()
Example #31
0
    def _verify_bandwidth_usage(self, node, nw_limit=1, no_of_nodes=2, event_time=None,
                                nw_usage="[1-9][0-9]*", end_time=None):
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(node) + '/goxdcr.log'
        nw_max = (nw_limit * 1024 * 1024)/no_of_nodes

        if event_time:
            time_to_compare = time.strptime(event_time, '%Y-%m-%dT%H:%M:%S')
        else:
            matches, _ = NodeHelper.check_goxdcr_log(node, "Success adding replication specification",
                                                 goxdcr_log, print_matches=True)
            time_to_compare_str = matches[-1].split(' ')[0].split('.')[0]
            time_to_compare = time.strptime(time_to_compare_str, '%Y-%m-%dT%H:%M:%S')

        matches, count = NodeHelper.check_goxdcr_log(node, "bandwidth_limit=" + str(nw_max) +
                                            ", bandwidth_usage=" + nw_usage, goxdcr_log, print_matches=True)
        match_count = 0
        skip_count = 0
        for item in matches:
            items = item.split(' ')
            item_time = items[0].split('.')[0]
            item_datetime = time.strptime(item_time, '%Y-%m-%dT%H:%M:%S')
            if item_datetime < time_to_compare:
                skip_count += 1
                continue
            if end_time:
                end_datetime = time.strptime(end_time, '%Y-%m-%dT%H:%M:%S')
                if item_datetime > end_datetime:
                    skip_count += 1
                    continue
            bandwidth_usage = items[-1].split('=')[-1]
            if int(bandwidth_usage) <= nw_max:
                match_count += 1
                continue
            else:
                self.fail("Bandwidth usage higher than Bandwidth limit in {0}".format(item))

        if match_count + skip_count == count:
            self.log.info("{0} stale entries skipped".format(skip_count))
            if match_count > 0:
                self.log.info("{0} entries checked - Bandwidth usage always lower than Bandwidth limit as expected".
                          format(match_count))
            else:
                if self._input.param("replication_type") == "capi":
                    self.log.info("Bandwidth Throttler not enabled on replication as expected")
                else:
                    self.fail("Bandwidth Throttler not enabled on replication")
Example #32
0
    def test_node_crash_master(self):
        self.setup_xdcr_and_load()

        crashed_nodes = []
        crash = self._input.param("crash", "").split('-')
        if "C1" in crash:
            crashed_nodes.append(self.src_master)
        if "C2" in crash:
            crashed_nodes.append(self.dest_master)

        self.__kill_processes(crashed_nodes)

        for crashed_node in crashed_nodes:
            self.__start_cb_server(crashed_node)
        NodeHelper.wait_warmup_completed(crashed_nodes)

        self.async_perform_update_delete()
        self.verify_results()
Example #33
0
    def test_node_crash_master(self):
        self.setup_xdcr_and_load()

        crashed_nodes = []
        crash = self._input.param("crash", "").split('-')
        if "C1" in crash:
            crashed_nodes.append(self.src_master)
        if "C2" in crash:
            crashed_nodes.append(self.dest_master)

        self.__kill_processes(crashed_nodes)

        for crashed_node in crashed_nodes:
            self.__start_cb_server(crashed_node)
        NodeHelper.wait_warmup_completed(crashed_nodes)

        self.async_perform_update_delete()
        self.verify_results()
Example #34
0
 def test_update_to_scramsha_auth(self):
     """
     Start with ordinary replication, then switch to use scram_sha_auth
     Search for success log statements
     """
     old_count = NodeHelper.check_goxdcr_log(self.src_cluster.get_master_node(),
                                             "HttpAuthMech=ScramSha for remote cluster reference remote_cluster")
     self.setup_xdcr()
     # modify remote cluster ref to use scramsha
     for remote_cluster in self.src_cluster.get_remote_clusters()+self.dest_cluster.get_remote_clusters():
         remote_cluster.use_scram_sha_auth()
     self.sleep(60, "wait before checking the logs for using scram-sha")
     for node in [self.src_cluster.get_master_node()]+[self.dest_cluster.get_master_node()]:
         count = NodeHelper.check_goxdcr_log(node, "HttpAuthMech=ScramSha for remote cluster reference remote_cluster")
         if count <= old_count:
             self.fail("Node {0} does not use SCRAM-SHA authentication".format(node.ip))
         else:
             self.log.info("SCRAM-SHA auth successful on node {0}".format(node.ip))
     self.verify_results()
Example #35
0
    def load_with_async_ops_and_joint_sets_with_warmup(self):
        bucket_type = self._input.param("bucket_type", "membase")
        if bucket_type == "ephemeral":
            "Test case does not apply for Ephemeral buckets"
            return
        self.setup_xdcr_and_load()
        warmupnodes = []
        if "C1" in self._warmup:
            warmupnodes.append(self.src_cluster.warmup_node())
        if "C2" in self._warmup:
            warmupnodes.append(self.dest_cluster.warmup_node())

        self.sleep(self._wait_timeout)
        self.async_perform_update_delete()
        self.sleep(self._wait_timeout / 2)

        NodeHelper.wait_warmup_completed(warmupnodes)

        self.verify_results()
Example #36
0
    def load_with_async_ops_with_warmup(self):
        bucket_type = self._input.param("bucket_type", "membase")
        if bucket_type == "ephemeral":
            "Test case does not apply for Ephemeral buckets"
            return
        self.setup_xdcr_and_load()
        warmupnodes = []
        if "C1" in self._warmup:
            warmupnodes.append(self.src_cluster.warmup_node())
        if "C2" in self._warmup:
            warmupnodes.append(self.dest_cluster.warmup_node())

        self.sleep(self._wait_timeout)
        self.async_perform_update_delete()
        self.sleep(self._wait_timeout / 2)

        NodeHelper.wait_warmup_completed(warmupnodes)

        self.verify_results()
Example #37
0
 def _extract_bandwith_quota(self, node):
     matches, count = NodeHelper.check_goxdcr_log(node,
                                                  "bandwidth_usage_quota=" +
                                                  "[0-9][0-9]*",
                                                  print_matches=True,
                                                  timeout=60)
     bandwidth_quota = int(
         float(((
             matches[-1].split('bandwidth_usage_quota=')[1]).rstrip(' '))))
     return bandwidth_quota
Example #38
0
    def replication_while_rebooting_a_non_master_src_dest_node(self):
        bucket_type = self._input.param("bucket_type", "membase")
        if bucket_type == "ephemeral":
            self.log.info("Test case does not apply to ephemeral")
            return
        self.setup_xdcr_and_load()
        self.async_perform_update_delete()
        self.sleep(self._wait_timeout)

        reboot_node_dest = self.dest_cluster.reboot_one_node(self)
        NodeHelper.wait_node_restarted(reboot_node_dest, self, wait_time=self._wait_timeout * 4, wait_if_warmup=True)

        reboot_node_src = self.src_cluster.reboot_one_node(self)
        NodeHelper.wait_node_restarted(reboot_node_src, self, wait_time=self._wait_timeout * 4, wait_if_warmup=True)

        self.sleep(120)
        ClusterOperationHelper.wait_for_ns_servers_or_assert([reboot_node_dest], self, wait_if_warmup=True)
        ClusterOperationHelper.wait_for_ns_servers_or_assert([reboot_node_src], self, wait_if_warmup=True)
        self.verify_results()
Example #39
0
    def test_verify_mb19181(self):
        load_tasks = self.setup_xdcr_async_load()
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0]) \
                     + '/goxdcr.log*'

        self.dest_cluster.failover_and_rebalance_master()

        for task in load_tasks:
            task.result()

        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                node,
                "Can't move update state from",
                goxdcr_log)
            self.assertEqual(count, 0, "Can't move update state from - error message found in " + str(node.ip))
            self.log.info("Can't move update state from - error message not found in " + str(node.ip))

        self.verify_results()
Example #40
0
    def test_verify_mb19802_2(self):
        load_tasks = self.setup_xdcr_async_load()
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'

        self.dest_cluster.failover_and_rebalance_master()

        for task in load_tasks:
            task.result()

        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                            node,
                            "batchGetMeta received fatal error and had to abort",
                            goxdcr_log)
            self.assertEqual(count, 0, "batchGetMeta fatal error message found in " + str(node.ip))
            self.log.info("batchGetMeta fatal error message not found in " + str(node.ip))

        self.sleep(300)
        self.verify_results()
Example #41
0
    def test_verify_mb19802_2(self):
        load_tasks = self.setup_xdcr_async_load()
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'

        self.dest_cluster.failover_and_rebalance_master()

        for task in load_tasks:
            task.result()

        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(node, "batchGetMeta timed out",
                                                goxdcr_log)
            self.assertEqual(
                count, 0, "batchGetMeta timed out error message found in " +
                str(node.ip))
            self.log.info(
                "batchGetMeta timed out error message not found in " +
                str(node.ip))

        self.verify_results()
Example #42
0
    def test_verify_mb19802_1(self):
        load_tasks = self.setup_xdcr_async_load()
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'

        conn = RemoteMachineShellConnection(self.dest_cluster.get_master_node())
        conn.stop_couchbase()

        for task in load_tasks:
            task.result()

        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(
                            node,
                            "batchGetMeta timed out",
                            goxdcr_log)
            self.assertEqual(count, 0, "batchGetMeta timed out error message found in " + str(node.ip))
            self.log.info("batchGetMeta timed out error message not found in " + str(node.ip))

        conn.start_couchbase()
        self.verify_results()
Example #43
0
 def get_checkpoint_call_history(self, node):
     chkpts, count = NodeHelper.check_goxdcr_log(node,
                                                "num_checkpoints",
                                                log_name="stats.log",
                                                print_matches=True,
                                                timeout=10)
     if count > 0:
         total_successful_chkpts = int((chkpts[-1].split('num_checkpoints,')[1]).rstrip('},'))
     else:
         total_successful_chkpts = 0
     self.log.info(total_successful_chkpts)
     chkpts, count = NodeHelper.check_goxdcr_log(node,
                                               "num_failedckpts",
                                               log_name="stats.log",
                                               print_matches=True,
                                               timeout=10)
     if count > 0:
         total_failed_chkpts = int((chkpts[-1].split('num_failedckpts,')[1]).rstrip('},'))
     else:
         total_failed_chkpts = 0
     return total_successful_chkpts + total_failed_chkpts, total_successful_chkpts, total_failed_chkpts
Example #44
0
    def test_capi_with_malformed_http_resp(self):
        repl_id = self._start_es_replication(
            xdcr_params={
                'workerBatchSize': '2000',
                'docBatchSizeKb': '8096',
                'targetNozzlePerNode': '64'
            })

        rest_conn = RestConnection(self.src_master)

        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED,
                                          'true')

        gen = DocumentGenerator('es',
                                '{{"key":"value","mutated":0}}',
                                xrange(100),
                                start=0,
                                end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)

        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED,
                                          'false')

        self._wait_for_es_replication_to_catchup()

        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self.src_master)\
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(node,
                                                "malformed HTTP response",
                                                goxdcr_log)
            self.assertEqual(
                count, 0, "malformed HTTP response error message found in " +
                str(node.ip))
            self.log.info(
                "malformed HTTP response error message not found in " +
                str(node.ip))

        self._verify_es_results()
Example #45
0
 def mutate_and_check_error404(self, n=1):
     # get vb0 active source node
     active_src_node = self.get_active_vb0_node(self.src_master)
     num_404_errors_before_load = NodeHelper.check_goxdcr_log(
                                         active_src_node,
                                         "ERRO GOXDCR.CheckpointMgr: GetRemoteMemcachedConnection Operation failed after max retries",
                                         timeout=30)
     self.sleep(60)
     self.log.info("################ New mutation:{0} ##################".format(self.key_counter+1))
     self.load_one_mutation_into_source_vb0(active_src_node)
     self.sleep(5)
     num_404_errors_after_load = NodeHelper.check_goxdcr_log(
                                         active_src_node,
                                         "ERRO GOXDCR.CheckpointMgr: GetRemoteMemcachedConnection Operation failed after max retries",
                                         timeout=30)
     if num_404_errors_after_load > num_404_errors_before_load:
         self.log.info("Topology change verified after dest failover/rebalance out")
         return True
     else:
         self.log.info("404 errors on source node before last load : {0}, after last node: {1}".
                       format(num_404_errors_before_load, num_404_errors_after_load))
         self.log.error("Topology change NOT recorded at source following dest failover or rebalance!")
Example #46
0
    def mutate_and_checkpoint(self, n=3, skip_validation=False):
        count = 1
        # get vb0 active source node
        stats_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/stats.log'
        active_src_node = self.get_active_vb0_node(self.src_master)
        while count <= n:
            remote_vbuuid, remote_highseqno = self.get_failover_log(self.dest_master)
            local_vbuuid, local_highseqno = self.get_failover_log(self.src_master)

            self.log.info("Local failover log: [{0}, {1}]".format(local_vbuuid, local_highseqno))
            self.log.info("Remote failover log: [{0}, {1}]".format(remote_vbuuid, remote_highseqno))
            self.log.info("################ New mutation:{0} ##################".format(self.key_counter+1))
            self.load_one_mutation_into_source_vb0(active_src_node)
            self.sleep(60)
            if local_highseqno == "0":
                # avoid checking very first/empty checkpoint record
                count += 1
                continue
            end_time = time.time() + self._wait_timeout
            while time.time() < end_time:
                stats_count = NodeHelper.check_goxdcr_log(
                            active_src_node,
                            "docs_checked,{0}".format(count),
                            stats_log)
                if stats_count > 0:
                    self.log.info("Checkpoint recorded as expected")
                    if not skip_validation:
                        self.log.info("Validating latest checkpoint")
                        self.get_and_validate_latest_checkpoint()
                    break
                else:
                    self.sleep(20, "Checkpoint not recorded yet, will check after 20s")
            else:
                self.log.info("Checkpointing failed - may not be an error if vb_uuid changed ")
                return False
            count += 1

        return True
Example #47
0
    def test_node_crash_cluster(self):
        self.setup_xdcr_and_load()

        crashed_nodes = []
        crash = self._input.param("crash", "").split('-')
        if "C1" in crash:
            crashed_nodes += self.src_cluster.get_nodes()
            self.__kill_processes(crashed_nodes)
            self.sleep(30)
        if "C2" in crash:
            crashed_nodes += self.dest_cluster.get_nodes()
            self.__kill_processes(crashed_nodes)

        for crashed_node in crashed_nodes:
            self.__start_cb_server(crashed_node)

        bucket_type = self._input.param("bucket_type", "membase")

        if "C1" in crash:
            if bucket_type == "ephemeral":
                self.sleep(self._wait_timeout)
            else:
                NodeHelper.wait_warmup_completed(self.src_cluster.get_nodes())
            gen_create = BlobGenerator('loadTwo',
                                       'loadTwo',
                                       self._value_size,
                                       end=self._num_items)
            self.src_cluster.load_all_buckets_from_generator(kv_gen=gen_create)

        self.async_perform_update_delete()

        if "C2" in crash:
            if bucket_type == "ephemeral":
                self.sleep(self._wait_timeout)
            else:
                NodeHelper.wait_warmup_completed(self.dest_cluster.get_nodes())

        self.verify_results()
Example #48
0
    def test_node_crash_master(self):
        self.setup_xdcr_and_load()

        crashed_nodes = []
        crash = self._input.param("crash", "").split('-')
        if "C1" in crash:
            crashed_nodes.append(self.src_master)
        if "C2" in crash:
            crashed_nodes.append(self.dest_master)

        self.__kill_processes(crashed_nodes)

        for crashed_node in crashed_nodes:
            self.__start_cb_server(crashed_node)

        bucket_type = self._input.param("bucket_type", "membase")
        if bucket_type == "ephemeral":
            self.sleep(self._wait_timeout)
        else:
            NodeHelper.wait_warmup_completed(crashed_nodes)

        self.async_perform_update_delete()
        self.verify_results()
Example #49
0
    def test_node_crash_master(self):
        self.setup_xdcr_and_load()

        crashed_nodes = []
        crash = self._input.param("crash", "").split('-')
        if "C1" in crash:
            crashed_nodes.append(self.src_master)
        if "C2" in crash:
            crashed_nodes.append(self.dest_master)

        self.__kill_processes(crashed_nodes)

        for crashed_node in crashed_nodes:
            self.__start_cb_server(crashed_node)

        bucket_type = self._input.param("bucket_type", "membase")
        if bucket_type == "ephemeral":
            self.sleep(self._wait_timeout)
        else:
            NodeHelper.wait_warmup_completed(crashed_nodes)

        self.async_perform_update_delete()
        self.verify_results()
Example #50
0
    def test_capi_with_advanced_settings(self):
        batch_count = self._input.param("batch_count", 10)
        batch_size = self._input.param("batch_size", 2048)
        source_nozzle = self._input.param("source_nozzle", 2)
        target_nozzle = self._input.param("target_nozzle", 2)
        enable_firewall = self._input.param("enable_firewall", False)

        capi_data_chan_size_multi = self._input.param("capi_data_chan_size_multi", None)
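        # Optionally bump CapiDataChanSizeMultiplier through the XDCR internal
        # settings REST endpoint on the source master before creating the
        # replication.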
        if capi_data_chan_size_multi:
            shell = RemoteMachineShellConnection(self.src_master)
            command = "curl -X POST -u Administrator:password http://127.0.0.1:9998/xdcr/internalSettings " + \
                      "-d CapiDataChanSizeMultiplier=" + str(capi_data_chan_size_multi)
            output, error = shell.execute_command(command)
            shell.log_command_output(output, error)

        repl_id = self._start_es_replication()

        rest_conn = RestConnection(self.src_master)

        rest_conn.set_xdcr_param('default', 'default', 'workerBatchSize', batch_count)
        rest_conn.set_xdcr_param('default', 'default', 'docBatchSizeKb', batch_size)
        rest_conn.set_xdcr_param('default', 'default', 'sourceNozzlePerNode', source_nozzle)
        rest_conn.set_xdcr_param('default', 'default', 'targetNozzlePerNode', target_nozzle)

        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'true')

        gen = DocumentGenerator('es', '{{"key":"value","mutated":0}}',  xrange(100), start=0, end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)

        rest_conn.pause_resume_repl_by_id(repl_id, REPL_PARAM.PAUSE_REQUESTED, 'false')

        if enable_firewall:
            NodeHelper.enable_firewall(self.dest_cluster.get_master_node())
            self.sleep(120)
            NodeHelper.disable_firewall(self.dest_cluster.get_master_node())

        self._verify_es_results()
Example #51
0
    def test_capi_with_malformed_http_resp(self):
        self.setup_xdcr()

        rest_conn = RestConnection(self.src_master)

        rest_conn.set_xdcr_param('default', 'default', 'workerBatchSize', 2000)
        rest_conn.set_xdcr_param('default', 'default', 'docBatchSizeKb', 8096)
        rest_conn.set_xdcr_param('default', 'default', 'targetNozzlePerNode',
                                 64)

        self.src_cluster.pause_all_replications()

        gen = DocumentGenerator('es',
                                '{{"key":"value","mutated":0}}',
                                xrange(100),
                                start=0,
                                end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)

        self.src_cluster.resume_all_replications()

        self._wait_for_replication_to_catchup()

        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self.src_master) \
                     + '/goxdcr.log*'
        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(node,
                                                "malformed HTTP response",
                                                goxdcr_log)
            self.assertEqual(
                count, 0, "malformed HTTP response error message found in " +
                str(node.ip))
            self.log.info(
                "malformed HTTP response error message not found in " +
                str(node.ip))

        self._verify_es_results()
Example #52
0
    def get_pre_replicate_call_history(self, node):
        # Count _pre_replicate REST calls made since the test started by
        # scanning http_access.log on the given node.
        prerep_calls, count = NodeHelper.check_goxdcr_log(node,
                                                          "POST /_goxdcr/_pre_replicate",
                                                          log_name="http_access.log",
                                                          timeout=10,
                                                          print_matches=True)
        total_successful_prereps = 0
        if count > 0:
            for call in prerep_calls:
                call_datetime = self._extract_timestamp(call)
                # Ignore calls that happened before the test started
                if call_datetime < self.time_test_started:
                    continue
                total_successful_prereps += 1
        return total_successful_prereps
Example #53
0
    def test_verify_mb19802_1(self):
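        # MB-19802: stopping Couchbase on the destination master while the
        # source is still loading should not produce "batchGetMeta timed out"
        # errors in goxdcr.log on any source node.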
        load_tasks = self.setup_xdcr_async_load()
        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'

        conn = RemoteMachineShellConnection(
            self.dest_cluster.get_master_node())
        conn.stop_couchbase()

        for task in load_tasks:
            task.result()

        for node in self.src_cluster.get_nodes():
            count = NodeHelper.check_goxdcr_log(node, "batchGetMeta timed out",
                                                goxdcr_log)
            self.assertEqual(
                count, 0, "batchGetMeta timed out error message found in " +
                str(node.ip))
            self.log.info(
                "batchGetMeta timed out error message not found in " +
                str(node.ip))

        conn.start_couchbase()
        self.verify_results()
Example #54
0
 def test_scramsha(self):
     """
     Creates a new bi-xdcr replication with scram-sha
     Make sure to pass use-scramsha=True
     from command line
     """
     self.setup_xdcr()
     self.sleep(60, "wait before checking logs")
     for node in [self.src_cluster.get_master_node()]+[self.dest_cluster.get_master_node()]:
         count = NodeHelper.check_goxdcr_log(node,
                     "HttpAuthMech=ScramSha for remote cluster reference remote_cluster")
         if count <= 0:
             self.fail("Node {0} does not use SCRAM-SHA authentication".format(node.ip))
         else:
             self.log.info("SCRAM-SHA auth successful on node {0}".format(node.ip))
     self.verify_results()
Example #55
0
 def test_scramsha(self):
     """
     Creates a new bi-xdcr replication with scram-sha
     Make sure to pass use-scramsha=True
     from command line
     """
     self.setup_xdcr()
     self.sleep(60, "wait before checking logs")
     for node in [self.src_cluster.get_master_node()
                  ] + [self.dest_cluster.get_master_node()]:
         count = NodeHelper.check_goxdcr_log(
             node,
             "HttpAuthMech=ScramSha for remote cluster reference remote_cluster",
             timeout=60)
         if count <= 0:
             self.fail(
                 "Node {0} does not use SCRAM-SHA authentication".format(
                     node.ip))
         else:
             self.log.info("SCRAM-SHA auth successful on node {0}".format(
                 node.ip))
     self.verify_results()
Example #56
0
 def __kill_processes(self, crashed_nodes=[]):
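     # Simulate a node crash by killing the erlang process on each node.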
     for node in crashed_nodes:
         NodeHelper.kill_erlang(node)
Example #57
0
    def test_backward_compatibility(self):
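        # Set up XDCR between C1 on the older initial version and C2 on the
        # latest upgrade version, load and mutate data (pausing/resuming and
        # modifying remote cluster references where the version allows),
        # then verify replication.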
        self.c1_version = self.initial_version
        self.c2_version = self.upgrade_versions[0]
        # install older version on C1
        self._install(self.servers[:self.src_init])
        # install latest version on C2
        self.initial_version = self.c2_version
        self._install(self.servers[self.src_init:])
        self.initial_version = self.c1_version
        self.create_buckets()
        # workaround for MB-15761
        if float(self.initial_version[:2]) < 3.0 and self._demand_encryption:
            rest = RestConnection(self.dest_master)
            rest.set_internalSetting('certUseSha1',"true")
            rest.regenerate_cluster_certificate()
        self._join_all_clusters()

        if float(self.c1_version[:2]) >= 3.0:
            for cluster in self.get_cb_clusters():
                for remote_cluster in cluster.get_remote_clusters():
                    remote_cluster.pause_all_replications()

        self.sleep(60)
        bucket = self.src_cluster.get_bucket_by_name('default')
        self._operations()
        self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
        bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
        self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
        bucket = self.dest_cluster.get_bucket_by_name('standard_bucket_1')
        gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)

        if float(self.c1_version[:2]) >= 3.0:
            for cluster in self.get_cb_clusters():
                for remote_cluster in cluster.get_remote_clusters():
                    remote_cluster.resume_all_replications()

        self._wait_for_replication_to_catchup()

        if float(self.c1_version[:2]) > 2.5:
            for remote_cluster in self.src_cluster.get_remote_clusters():
                remote_cluster.modify()
            for remote_cluster in self.dest_cluster.get_remote_clusters():
                remote_cluster.modify()

        self.sleep(30)

        bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
        gen_create3 = BlobGenerator('loadThree', 'loadThree', self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
        bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
        gen_create4 = BlobGenerator('loadFour', 'loadFour', self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.dest_master, gen_create4, 'create', exp=0)
        bucket = self.src_cluster.get_bucket_by_name('default')
        self._load_bucket(bucket, self.src_master, gen_create2, 'create', exp=0)

        self.merge_all_buckets()
        self.sleep(60)
        self._post_upgrade_ops()
        self.sleep(60)
        self.verify_results()
        if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
            goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'
            for node in self.src_cluster.get_nodes():
                count = NodeHelper.check_goxdcr_log(
                            node,
                            "Failed to repair connections to target cluster",
                            goxdcr_log)
                self.assertEqual(count, 0, "Failed to repair connections to target cluster "
                                        "error message found in " + str(node.ip))
                self.log.info("Failed to repair connections to target cluster "
                                        "error message not found in " + str(node.ip))
Example #58
0
    def online_cluster_upgrade(self):
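        # Online-upgrade C1 and then C2 by swapping in nodes running the
        # upgrade version, checking goxdcr metadata migration (and, where
        # applicable, SSL usage) after each cluster is upgraded, with loads
        # before, during and after the upgrade.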
        self._install(self.servers[:self.src_init + self.dest_init])
        prev_initial_version = self.initial_version
        self.initial_version = self.upgrade_versions[0]
        self._install(self.servers[self.src_init + self.dest_init:])
        self.create_buckets()
        self._join_all_clusters()

        if float(prev_initial_version[:2]) < 3.0:
            self.pause_xdcr_cluster = None

        bucket_default = self.src_cluster.get_bucket_by_name('default')
        bucket_sasl = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
        bucket_standard = self.dest_cluster.get_bucket_by_name('standard_bucket_1')
        bucket_sasl_2 = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
        gen_create2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size, end=self.num_items)
        gen_delete2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size,
            start=int(self.num_items * float(100 - self._perc_del) / 100), end=self.num_items)
        gen_update2 = BlobGenerator('loadTwo', 'loadTwo-', self._value_size, start=0,
            end=int(self.num_items * float(self._perc_upd) / 100))

        self._load_bucket(bucket_default, self.src_master, self.gen_create, 'create', exp=0)
        self._load_bucket(bucket_sasl, self.src_master, self.gen_create, 'create', exp=0)

        if self.pause_xdcr_cluster:
            for cluster in self.get_cb_clusters():
                for remote_cluster in cluster.get_remote_clusters():
                    remote_cluster.pause_all_replications()
        self._online_upgrade(self.src_nodes, self.servers[self.src_init + self.dest_init:])
        self.src_master = self.servers[self.src_init + self.dest_init]

        if not self.is_goxdcr_migration_successful(self.src_master):
            self.fail("C1: Metadata migration failed after old nodes were removed")

        self._load_bucket(bucket_standard, self.dest_master, self.gen_create, 'create', exp=0)
        self._load_bucket(bucket_default, self.src_master, self.gen_update, 'create', exp=self._expires)
        self._load_bucket(bucket_sasl, self.src_master, self.gen_update, 'create', exp=self._expires)
        self._install(self.src_nodes)
        self._online_upgrade(self.servers[self.src_init + self.dest_init:], self.src_nodes, False)
        self._load_bucket(bucket_sasl_2, self.dest_master, gen_create2, 'create', exp=0)
        self.src_master = self.servers[0]

        self.log.info("###### Upgrading C1: completed ######")

        self._install(self.servers[self.src_init + self.dest_init:])
        self.sleep(60)
        self._online_upgrade(self.dest_nodes, self.servers[self.src_init + self.dest_init:])
        self.dest_master = self.servers[self.src_init + self.dest_init]

        if not self.is_goxdcr_migration_successful(self.dest_master):
            self.fail("C2: Metadata migration failed after old nodes were removed")

        self._install(self.dest_nodes)
        self.sleep(60)
        if float(self.initial_version[:2]) >= 3.0 and self._demand_encryption:
            if not self.is_ssl_over_memcached(self.src_master):
                self.fail("C1: After old nodes were replaced, C1 still uses "
                          "proxy connection to C2 which is >= 3.0")
            if not self.is_ssl_over_memcached(self.dest_master):
                self.fail("C2: After old nodes were replaced, C2 still uses "
                          "proxy connection to C1 which is >= 3.0")

        self._online_upgrade(self.servers[self.src_init + self.dest_init:], self.dest_nodes, False)
        self.dest_master = self.servers[self.src_init]

        self.log.info("###### Upgrading C2: completed ######")

        if self.pause_xdcr_cluster:
            for cluster in self.get_cb_clusters():
                for remote_cluster in cluster.get_remote_clusters():
                    remote_cluster.resume_all_replications()

        self._load_bucket(bucket_default, self.src_master, self.gen_delete, 'delete', exp=0)
        self._load_bucket(bucket_sasl, self.src_master, self.gen_delete, 'delete', exp=0)
        self._load_bucket(bucket_standard, self.dest_master, self.gen_delete, 'delete', exp=0)
        self._load_bucket(bucket_sasl_2, self.dest_master, gen_delete2, 'delete', exp=0)

        self._wait_for_replication_to_catchup()
        self._post_upgrade_ops()
        self.sleep(120)
        self.verify_results()
        self.max_verify = None
        if self.ddocs_src:
            for bucket_name in self.buckets_on_src:
                bucket = self.src_cluster.get_bucket_by_name(bucket_name)
                expected_rows = sum([len(kv_store) for kv_store in bucket.kvs.values()])
                self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_src, self.src_master)

        if self.ddocs_dest:
            for bucket_name in self.buckets_on_dest:
                bucket = self.dest_cluster.get_bucket_by_name(bucket_name)
                expected_rows = sum([len(kv_store) for kv_store in bucket.kvs.values()])
                self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_dest, self.dest_master)

        if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
            goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'
            for node in self.src_cluster.get_nodes():
                count = NodeHelper.check_goxdcr_log(
                            node,
                            "Failed to repair connections to target cluster",
                            goxdcr_log)
                self.assertEqual(count, 0, "Failed to repair connections to target cluster "
                                        "error message found in " + str(node.ip))
                self.log.info("Failed to repair connections to target cluster "
                                        "error message not found in " + str(node.ip))
Example #59
0
    def offline_cluster_upgrade(self):
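        # Offline-upgrade the clusters named in the 'upgrade_nodes' param,
        # then verify goxdcr metadata migration, optional encryption settings
        # and replication correctness after the upgrade.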

        # install on src and dest nodes
        self._install(self.servers[:self.src_init + self.dest_init])
        upgrade_nodes = self.input.param('upgrade_nodes', "src").split(";")

        self.create_buckets()
        self._join_all_clusters()
        if float(self.initial_version[:2]) < 3.0:
            self.pause_xdcr_cluster = None
        bucket = self.src_cluster.get_bucket_by_name('default')
        self._operations()
        self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
        bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
        self._load_bucket(bucket, self.src_master, self.gen_create, 'create', exp=0)
        bucket = self.dest_cluster.get_bucket_by_name('standard_bucket_1')
        gen_create2 = BlobGenerator('loadTwo', 'loadTwo', self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.dest_master, gen_create2, 'create', exp=0)

        self._wait_for_replication_to_catchup()
        if self.pause_xdcr_cluster:
            for cluster in self.get_cb_clusters():
                for remote_cluster in cluster.get_remote_clusters():
                    remote_cluster.pause_all_replications()

        nodes_to_upgrade = []
        if "src" in upgrade_nodes:
            nodes_to_upgrade += self.src_nodes
        if "dest" in upgrade_nodes:
            nodes_to_upgrade += self.dest_nodes

        self._offline_upgrade(nodes_to_upgrade)

        self.log.info("######### Upgrade of C1 and C2 completed ##########")

        if not self.is_goxdcr_migration_successful(self.src_master):
            self.fail("C1: Metadata migration failed after offline upgrade of C1")

        if not self.is_goxdcr_migration_successful(self.dest_master):
            self.fail("C2: Metadata migration failed after offline upgrade of C2")

        if self._use_encryption_after_upgrade and "src" in upgrade_nodes and "dest" in upgrade_nodes and self.upgrade_versions[0] >= "2.5.0":
            if "src" in self._use_encryption_after_upgrade:
                for remote_cluster in self.src_cluster.get_remote_clusters():
                    remote_cluster._modify()
            if "dest" in self._use_encryption_after_upgrade:
                for remote_cluster in self.dest_cluster.get_remote_clusters():
                    remote_cluster._modify()
        self.sleep(60)

        if self._demand_encryption or self._use_encryption_after_upgrade:
            if not self.is_ssl_over_memcached(self.src_master):
                self.fail("C1: After old nodes were replaced, C1 still uses "
                          "ns_proxy connection to C2 which is >= 3.0")
            if not self.is_ssl_over_memcached(self.dest_master):
                self.fail("C2: After old nodes were replaced, C2 still uses "
                          "ns_proxy connection to C1 which is >= 3.0")

        bucket = self.src_cluster.get_bucket_by_name('sasl_bucket_1')
        gen_create3 = BlobGenerator('loadThree', 'loadThree', self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.src_master, gen_create3, 'create', exp=0)
        bucket = self.dest_cluster.get_bucket_by_name('sasl_bucket_1')
        gen_create4 = BlobGenerator('loadFour', 'loadFour', self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.dest_master, gen_create4, 'create', exp=0)
        if self.pause_xdcr_cluster:
            for cluster in self.get_cb_clusters():
                for remote_cluster in cluster.get_remote_clusters():
                    remote_cluster.resume_all_replications()
        bucket = self.src_cluster.get_bucket_by_name('default')
        gen_create5 = BlobGenerator('loadFive', 'loadFive', self._value_size, end=self.num_items)
        self._load_bucket(bucket, self.src_master, gen_create5, 'create', exp=0)
        self.merge_all_buckets()
        self.sleep(60)
        self._post_upgrade_ops()
        self.sleep(60)
        self.verify_results()
        self.max_verify = None
        if self.ddocs_src:
            for bucket_name in self.buckets_on_src:
                bucket = self.src_cluster.get_bucket_by_name(bucket_name)
                expected_rows = sum([len(kv_store) for kv_store in bucket.kvs.values()])
                self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_src, self.src_master)

        if self.ddocs_dest:
            for bucket_name in self.buckets_on_dest:
                bucket = self.dest_cluster.get_bucket_by_name(bucket_name)
                expected_rows = sum([len(kv_store) for kv_store in bucket.kvs.values()])
                self._verify_ddocs(expected_rows, [bucket_name], self.ddocs_dest, self.dest_master)

        if float(self.initial_version[:3]) == 3.1 and float(self.upgrade_versions[0][:3]) == 4.1:
            goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'
            for node in self.src_cluster.get_nodes():
                count = NodeHelper.check_goxdcr_log(
                            node,
                            "Failed to repair connections to target cluster",
                            goxdcr_log)
                self.assertEqual(count, 0, "Failed to repair connections to target cluster "
                                        "error message found in " + str(node.ip))
                self.log.info("Failed to repair connections to target cluster "
                                        "error message not found in " + str(node.ip))
Example #60
0
    def test_rollback(self):
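        # Trigger a DCP rollback on the source (stop persistence, load, kill
        # memcached on the master, fail over) and verify that replication
        # catches up and that goxdcr.log records the rollback.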
        bucket = self.src_cluster.get_buckets()[0]
        nodes = self.src_cluster.get_nodes()

        # Stop Persistence on Node A & Node B
        for node in nodes:
            mem_client = MemcachedClientHelper.direct_client(node, bucket)
            mem_client.stop_persistence()

        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'
        self.setup_xdcr()

        self.src_cluster.pause_all_replications()

        gen = BlobGenerator("C1-", "C1-", self._value_size, end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)

        self.src_cluster.resume_all_replications()

        # Perform mutations on the bucket
        self.async_perform_update_delete()

        rest1 = RestConnection(self.src_cluster.get_master_node())
        rest2 = RestConnection(self.dest_cluster.get_master_node())

        # Fetch count of docs in src and dest cluster
        _count1 = rest1.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
        _count2 = rest2.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]

        self.log.info("Before rollback src cluster count = {0} dest cluster count = {1}".format(_count1, _count2))

        # Kill memcached on Node A so that Node B becomes master
        shell = RemoteMachineShellConnection(self.src_cluster.get_master_node())
        shell.kill_memcached()

        # Start persistence on Node B
        mem_client = MemcachedClientHelper.direct_client(nodes[1], bucket)
        mem_client.start_persistence()

        # Failover Node B
        failover_task = self.src_cluster.async_failover()
        failover_task.result()

        # Wait for Failover & rollback to complete
        self.sleep(60)

        # Fetch count of docs in src and dest cluster
        _count1 = rest1.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
        _count2 = rest2.fetch_bucket_stats(bucket=bucket.name)["op"]["samples"]["curr_items"][-1]

        self.log.info("After rollback src cluster count = {0} dest cluster count = {1}".format(_count1, _count2))

        self.assertTrue(self.src_cluster.wait_for_outbound_mutations(),
                        "Mutations in source cluster not replicated to target after rollback")
        self.log.info("Mutations in source cluster replicated to target after rollback")

        count = NodeHelper.check_goxdcr_log(
                        nodes[0],
                        "Received rollback from DCP stream",
                        goxdcr_log)
        self.assertGreater(count, 0, "rollback did not happen as expected")
        self.log.info("rollback happened as expected")