Example #1
    def test_system_indexes_rebalance(self):
        index_names = []
        self.collections_helper.create_scope(bucket_name="default", scope_name='scope1')
        for y in range(0, 10):
            self.collections_helper.create_collection(bucket_name="default", scope_name='scope1',
                                                      collection_name="collection" + str(y))
        for i in range(0, 10):
            self.run_cbq_query("CREATE INDEX idx{0} on default:default.scope1.collection{0}(fake) WITH {{'nodes': '{1}:{2}'}}".format(i, self.servers[1].ip, self.servers[1].port))
            index_names.append(("idx" + str(i), "collection" + str(i)))

        # Rebalance in an index node
        rebalance = self.cluster.async_rebalance(self.servers, [self.servers[2]], [], services=["index"])
        reached = RestHelper(self.rest).rebalance_reached()
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        rebalance.result()

        self.verify_all_indexes(index_names)

        # Rebalance out an index node
        rebalance = self.cluster.async_rebalance(self.servers, [], [self.servers[1]])
        reached = RestHelper(self.rest).rebalance_reached()
        self.assertTrue(reached, "rebalance failed, stuck or did not complete")
        rebalance.result()

        time.sleep(5)
        self.wait_for_all_indexes_online()
        self.verify_all_indexes(index_names)
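
For reference, a minimal sketch of what the templated CREATE INDEX statement above renders to for i = 0 (the IP and port are placeholder values):

    # The doubled braces {{...}} in the template escape to literal braces.
    i, ip, port = 0, "172.23.100.11", "8091"  # placeholder node address
    print("CREATE INDEX idx{0} on default:default.scope1.collection{0}(fake) "
          "WITH {{'nodes': '{1}:{2}'}}".format(i, ip, port))
    # -> CREATE INDEX idx0 on default:default.scope1.collection0(fake)
    #    WITH {'nodes': '172.23.100.11:8091'}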
Example #2
    def remove_node(self, otpnode=None, wait_for_rebalance=True, rest=None):
        """
        Method to remove nodes from a cluster.
        :param otpnode: list of nodes to be removed.
        :param wait_for_rebalance: boolean, wait for rebalance to finish
                                   after removing the nodes.
        :param rest: RestConnection object
        """
        if not rest:
            rest = self.rest
        nodes = rest.node_statuses()
        # This is the case when the master node is running the cbas service as well
        if len(nodes) <= len(otpnode):
            return

        helper = RestHelper(rest)
        try:
            removed = helper.remove_nodes(
                knownNodes=[node.id for node in nodes],
                ejectedNodes=[node.id for node in otpnode],
                wait_for_rebalance=wait_for_rebalance)
        except Exception:
            self.sleep(5, "Rebalance failed on removal. Retrying... THIS IS A BUG")
            removed = helper.remove_nodes(
                knownNodes=[node.id for node in nodes],
                ejectedNodes=[node.id for node in otpnode],
                wait_for_rebalance=wait_for_rebalance)
        if wait_for_rebalance:
            self.assertTrue(
                removed,
                "Rebalance operation failed while removing %s" % otpnode)
Example #3
 def terminate(self):
     if self._xdcr:
         self._terminate_replications(self._s_master, "cluster1")
         if self._rdirection == "bidirection":
             self._terminate_replications(self._d_master, "cluster0")
     for key in self._clusters_keys_olst:
         nodes = self._clusters_dic[key]
         for node in nodes:
             rest = RestConnection(node)
             buckets = rest.get_buckets()
             for bucket in buckets:
                 status = rest.delete_bucket(bucket.name)
                 if status:
                     self._log.info('Deleted bucket : {0} from {1}'.format(
                         bucket.name, node.ip))
         rest = RestConnection(nodes[0])
         helper = RestHelper(rest)
         servers = rest.node_statuses()
         master_id = rest.get_nodes_self().id
         if len(nodes) > 1:
             removed = helper.remove_nodes(
                 knownNodes=[node.id for node in servers],
                 ejectedNodes=[
                     node.id for node in servers if node.id != master_id
                 ],
                 wait_for_rebalance=True)
Example #4
 def test_failure_scenarios_during_recovery_of_node_A(self):
     self.recovery_type = self.input.param("recovery_type", 'full')
     # enable auto failover and canAbortRebalance
     self.enable_autofailover_and_validate()
     self.sleep(5)
     # do a graceful failover
     self.cluster.failover([self.master], failover_nodes=[self.servers[self.server_index_to_fail]], graceful=True)
     # wait for failover to complete
     self.wait_for_failover_or_assert(1, 500)
      # recover the failed-over node using the configured recovery type
     self.rest.set_recovery_type(otpNode='ns_1@' + self.servers[self.server_index_to_fail].ip,
                                 recoveryType=self.recovery_type)
     # Start rebalance of recovered nodes
     rebalance_task = self.cluster.async_rebalance(self.servers, [], [])
     reached = RestHelper(self.rest).rebalance_reached(percentage=30)
     self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(30))
     try:
          # Do a failover action - reboot, hang, kill. This is defined in the conf file
         self.failover_actions[self.failover_action](self)
         rebalance_task.result()
     except Exception as ex:
         self.log.info("Rebalance failed with : {0}".format(str(ex)))
         if "Rebalance failed. See logs for detailed reason. You can try again" in str(ex):
             self.log.info(
                 "Rebalance failed even before auto-failover had a chance to stop it self.server_to_fail.ip: {0}".format(
                     str(ex)))
         elif not RestHelper(self.rest).is_cluster_rebalanced():
             if self._auto_failover_message_present_in_logs(self.server_to_fail[0].ip):
                 self.log.info("Rebalance interrupted due to auto-failover of nodes - message was seen in logs")
             else:
                 self.fail("Rebalance interrupted message was not seen in logs")
         else:
             self.fail("Rebalance was not aborted by auto fail-over")
     # Reset auto failover settings
     self.disable_autofailover_and_validate()
Example #5
 def test_failure_scenarios_during_rebalance_in_of_node_A(self):
     # enable auto failover and canAbortRebalance
     self.enable_autofailover_and_validate()
     self.sleep(5)
     # Start rebalance in
     rebalance_task = self.cluster.async_rebalance(self.servers,
                                                   self.servers_to_add,
                                                   self.servers_to_remove)
     reached = RestHelper(self.rest).rebalance_reached(percentage=30)
     self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(30))
     try:
          # Do a failover action - reboot, hang, kill. This is defined in the conf file
         self.failover_actions[self.failover_action](self)
         rebalance_task.result()
     except Exception as ex:
         self.log.info("Rebalance failed with : {0}".format(str(ex)))
         if "Rebalance failed. See logs for detailed reason. You can try again" in str(ex):
             self.log.info(
                 "Rebalance failed even before auto-failover had a chance to stop it self.server_to_fail.ip: {0}".format(
                     str(ex)))
         elif not RestHelper(self.rest).is_cluster_rebalanced():
             if self._auto_failover_message_present_in_logs(self.server_to_fail[0].ip):
                 self.log.info("Rebalance interrupted due to auto-failover of nodes - message was seen in logs")
             else:
                 self.fail("Rebalance interrupted message was not seen in logs")
         else:
             self.fail("Rebalance was not aborted by auto fail-over")
     # Reset auto failover settings
     self.disable_autofailover_and_validate()
Example #6
    def test_stream_after_warmup(self):

        nodeA = self.servers[0]
        bucket = 'standard_bucket' + str(self.standard_buckets - 1)
        originalVbInfo = self.all_vb_info(nodeA, bucket=bucket)
        expectedVbSeqno = {}

        # load all buckets
        doc_gen = BlobGenerator('dcpdata',
                                'dcpdata-',
                                self.value_size,
                                end=self.num_items)
        self._load_all_buckets(self.master, doc_gen, "create", 0)
        self._wait_for_stats_all_buckets([nodeA])

        # store expected vb seqnos
        originalVbInfo = self.all_vb_info(nodeA, bucket=bucket)

        # restart node
        assert self.stop_node(0)
        time.sleep(5)
        assert self.start_node(0)
        rest = RestHelper(RestConnection(nodeA))
        assert rest.is_ns_server_running()
        time.sleep(2)

        # verify original vbInfo can be streamed
        dcp_client = self.dcp_client(nodeA, PRODUCER, bucket_name=bucket)
        for vbucket in originalVbInfo:
            vb_uuid, _, high_seqno = originalVbInfo[vbucket]
            stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, vb_uuid)
            responses = stream.run()
            assert high_seqno == stream.last_by_seqno
Example #7
 def test_rebalance_in_query_node(self):
     self.with_retry(lambda: self.ensure_primary_indexes_exist(),
                     eval=None,
                     delay=3,
                     tries=5)
     self.run_cbq_query(
         query="PREPARE p1 from select * from default limit 5",
         server=self.servers[0])
     self.sleep(5)
     for i in range(self.nodes_init):
         self.run_cbq_query(query="execute p1", server=self.servers[i])
     services_in = ["n1ql", "index", "data"]
     rebalance = self.cluster.async_rebalance(
         self.servers[:self.nodes_init], [self.servers[self.nodes_init]],
         [],
         services=services_in)
     reached = RestHelper(self.rest).rebalance_reached()
     self.assertTrue(reached, "rebalance failed, stuck or did not complete")
     rebalance.result()
     self.sleep(30)
     try:
         for i in range(self.nodes_init + 1):
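              # "[%s:%s]p1" renders to the node-qualified prepared statement
              # name, so each node executes the plan prepared on servers[0]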
             self.run_cbq_query(query="execute '[%s:%s]p1'" %
                                (self.servers[0].ip, self.servers[0].port),
                                server=self.servers[i])
     finally:
         rebalance = self.cluster.async_rebalance(
             self.servers[:self.nodes_init], [],
             to_remove=[self.servers[self.nodes_init]])
         reached = RestHelper(self.rest).rebalance_reached()
         self.assertTrue(reached,
                         "rebalance failed, stuck or did not complete")
         rebalance.result()
Example #8
    def test_crash_while_streaming(self):
        bucket = self.bucket_util.buckets[0]
        vbucket = randint(0, self.vbuckets)
        nodeA = self.servers[0]
        self.load_docs(bucket, vbucket, 0, self.num_items, "create")

        shell_conn = RemoteMachineShellConnection(nodeA)
        cb_stat_obj = Cbstats(shell_conn)

        dcp_client = self.dcp_client(nodeA, dcp.constants.PRODUCER)
        _ = dcp_client.stream_req(vbucket, 0, 0, 2 * self.num_items, 0)
        self.load_docs(bucket, vbucket, self.num_items, self.num_items, "create")
        self.assertTrue(self.stop_node(0), msg="Failed during stop_node")
        self.sleep(2, "Sleep after stop_node")
        self.assertTrue(self.start_node(0), msg="Failed during start_node")
        rest = RestHelper(RestConnection(nodeA))
        self.assertTrue(rest.is_ns_server_running(),
                        msg="Failed while is_ns_server_running check")
        self.sleep(30, "Sleep to wait for ns_server to run")

        vb_info = cb_stat_obj.vbucket_seqno(bucket.name)
        dcp_client = self.dcp_client(nodeA, dcp.constants.PRODUCER)
        stream = dcp_client.stream_req(vbucket, 0, 0,
                                       vb_info[vbucket]["high_seqno"], 0)
        stream.run()
        self.assertTrue(stream.last_by_seqno == vb_info[vbucket]["high_seqno"],
                        msg="Mismatch in high_seqno. {0} == {1}".format(
                            vb_info[vbucket]["high_seqno"],
                            stream.last_by_seqno))

        # Disconnect shell Connection for the node
        shell_conn.disconnect()
Example #9
    def test_start_stop_rebalance(self):
        """
        Start-stop rebalance in/out with adding/removing aditional after stopping rebalance.

        This test begins by loading a given number of items into the cluster. It then
        add  servs_in nodes and remove  servs_out nodes and start rebalance. Then rebalance
        is stopped when its progress reached 20%. After we add  extra_nodes_in and remove
        extra_nodes_out. Restart rebalance with new cluster configuration. Later rebalance
        will be stop/restart on progress 40/60/80%. After each iteration we wait for
        the disk queues to drain, and then verify that there has been no data loss,
        sum(curr_items) match the curr_items_total. Once cluster was rebalanced the test is finished.
        The oder of add/remove nodes looks like:
        self.nodes_init|servs_in|extra_nodes_in|extra_nodes_out|servs_out
        """
        rest = RestConnection(self.cluster.master)
        self.bucket_util._wait_for_stats_all_buckets()
        self.log.info("Current nodes : {0}".format(
            [node.id for node in rest.node_statuses()]))
        self.log.info("Adding nodes {0} to cluster".format(self.servs_in))
        self.log.info("Removing nodes {0} from cluster".format(self.servs_out))
        add_in_once = self.extra_servs_in
        _ = set(self.servs_init + self.servs_in) - set(self.servs_out)
        # the last iteration runs with i=5; by then the rebalance should be completed,
        # which is also verified and tracked
        for i in range(1, 6):
            if i == 1:
                rebalance = self.task.async_rebalance(
                    self.servs_init[:self.nodes_init], self.servs_in,
                    self.servs_out)
            else:
                rebalance = self.task.async_rebalance(
                    self.servs_init[:self.nodes_init] + self.servs_in,
                    add_in_once, self.servs_out + self.extra_servs_out)
                add_in_once = []
                _ = set(self.servs_init + self.servs_in + self.extra_servs_in) \
                    - set(self.servs_out + self.extra_servs_out)
            self.sleep(20)
            expected_progress = 20 * i
            reached = RestHelper(rest).rebalance_reached(expected_progress)
            self.assertTrue(
                reached, "Rebalance failed or did not reach {0}%".format(
                    expected_progress))
            if not RestHelper(rest).is_cluster_rebalanced():
                self.log.info("Stop the rebalance")
                stopped = rest.stop_rebalance(
                    wait_timeout=self.wait_timeout / 3)
                self.assertTrue(stopped, msg="Unable to stop rebalance")
            self.task_manager.get_task_result(rebalance)
            if RestHelper(rest).is_cluster_rebalanced():
                self.validate_docs()
                self.log.info(
                    "Rebalance was completed when tried to stop rebalance on {0}%"
                    .format(str(expected_progress)))
                break
            else:
                self.log.info(
                    "Rebalance is still required. Verifying the data in the buckets"
                )
                self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.verify_unacked_bytes_all_buckets()
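
RestHelper(rest).rebalance_reached(expected_progress), used throughout these examples, blocks until rebalance progress crosses a threshold. Conceptually it is a polling loop like the sketch below; get_progress stands in for the real REST progress call and is an assumption, not the actual API:

    import time

    def rebalance_reached_sketch(get_progress, threshold=100,
                                 timeout=600, poll=2):
        # Poll a progress callable (returning 0-100) until it reaches the
        # threshold or the timeout elapses; True on success, False otherwise.
        deadline = time.time() + timeout
        while time.time() < deadline:
            if get_progress() >= threshold:
                return True
            time.sleep(poll)
        return False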
Example #10
 def test_node_reboot(self):
     wait_timeout = 120
     timeout = self.timeout / 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     shell = RemoteMachineShellConnection(self.server_fail)
     if shell.extract_remote_info().type.lower() == 'windows':
         o, r = shell.execute_command("shutdown -r -f -t 0")
     elif shell.extract_remote_info().type.lower() == 'linux':
         o, r = shell.execute_command("reboot")
     shell.log_command_output(o, r)
     if shell.extract_remote_info().type.lower() == 'windows':
         time.sleep(wait_timeout * 5)
     else:
         time.sleep(wait_timeout)
     # disable firewall on the node
     shell = RemoteMachineShellConnection(self.server_fail)
     shell.disable_firewall()
     AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                         timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                         self)
     helper = RestHelper(self.rest)
     self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
     self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
     self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
     self.assertTrue(self.rest.monitorRebalance())
     buckets = self.rest.get_buckets()
     for bucket in buckets:
         self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
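
The per-OS branching above picks a reboot command and a wait time. A minimal sketch factoring out the command choice (a hypothetical helper; the test inlines this logic):

    def reboot_command(os_type):
        # Windows needs an explicit forced, immediate restart;
        # Linux just calls reboot (mirrors the branches above).
        if os_type.lower() == 'windows':
            return "shutdown -r -f -t 0"
        return "reboot"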
Example #11
 def test_rename_rebalance_start_stop(self):
     expected_progress = self.input.param('expected_progress', 30)
     if len(self.servers) < 2:
         self.fail("test require more than 1 node")
     hostnames = self.rename_nodes(self.servers[:self.nodes_in +
                                                self.nodes_init])
     self._set_hostames_to_servers_objs(hostnames)
     self.verify_referenced_by_names(
         self.servers[:self.nodes_in + self.nodes_init], hostnames)
     rebalance = self.cluster.async_rebalance(
         self.servers[:self.nodes_init],
         self.servers[self.nodes_init:self.nodes_in + self.nodes_init], [],
         use_hostnames=True)
     self.sleep(3, 'wait for some progress in rebalance...')
     rest = RestConnection(self.master)
     reached = RestHelper(rest).rebalance_reached(expected_progress)
     self.assertTrue(
         reached,
         "rebalance failed or did not reach {0}%".format(expected_progress))
     if not RestHelper(rest).is_cluster_rebalanced():
         stopped = rest.stop_rebalance(wait_timeout=self.wait_timeout // 3)
         self.assertTrue(stopped, msg="unable to stop rebalance")
     self.verify_referenced_by_names(
         self.servers[:self.nodes_in + self.nodes_init], hostnames)
      self.cluster.rebalance(self.servers[:self.nodes_in +
                                          self.nodes_init], [], [],
                            use_hostnames=True)
     self.verify_referenced_by_names(
         self.servers[:self.nodes_in + self.nodes_init], hostnames)
Example #12
 def test_failure_scenarios_during_rebalance_out_of_failedover_node_A(self):
     # enable auto failover and canAbortRebalance
     self.enable_autofailover_and_validate()
     # failover a node
     self.cluster.failover([self.master], failover_nodes=[self.servers[self.server_index_to_fail]], graceful=False)
     # wait for failover to complete
     self.wait_for_failover_or_assert(1, 500)
     # Start rebalance out
     rebalance_task = self.cluster.async_rebalance(self.servers,
                                                   [],
                                                   [self.servers[self.server_index_to_fail]])
     reached = RestHelper(self.rest).rebalance_reached(percentage=30)
     self.assertTrue(reached, "Rebalance failed or did not reach {0}%".format(30))
     try:
          # Do a failover action - reboot, hang, kill. This is defined in the conf file
         self.failover_actions[self.failover_action](self)
         rebalance_task.result()
     except Exception as ex:
         self.log.info("Rebalance failed with : {0}".format(str(ex)))
         if "Rebalance failed. See logs for detailed reason. You can try again" in str(ex):
             self.fail("Rebalance failed when it was not expected to fail".format(str(ex)))
         elif not RestHelper(self.rest).is_cluster_rebalanced():
             if self._auto_failover_message_present_in_logs(self.server_to_fail[0].ip):
                 self.fail("Rebalance interrupted due to auto-failover of nodes - It was not expected")
             else:
                 self.log.info("Rebalance was not interrupted as expected")
         else:
             self.log.info("Rebalance completes successfully")
     # Reset auto failover settings
     self.disable_autofailover_and_validate()
Example #13
    def test_stream_after_warmup(self):

        nodeA = self.servers[0]
        bucket = 'standard_bucket' + str(self.standard_buckets - 1)
        originalVbInfo = self.all_vb_info(nodeA, bucket=bucket)
        expectedVbSeqno = {}

        # load all buckets
        doc_gen = BlobGenerator(
            'dcpdata', 'dcpdata-', self.value_size, end=self.num_items)
        self._load_all_buckets(self.master, doc_gen, "create", 0)
        self._wait_for_stats_all_buckets([nodeA])

        # store expected vb seqnos
        originalVbInfo = self.all_vb_info(nodeA, bucket=bucket)

        # restart node
        assert self.stop_node(0)
        time.sleep(5)
        assert self.start_node(0)
        rest = RestHelper(RestConnection(nodeA))
        assert rest.is_ns_server_running()
        time.sleep(2)

        # verify original vbInfo can be streamed
        dcp_client = self.dcp_client(nodeA, PRODUCER, auth_user=bucket)
        for vbucket in originalVbInfo:
            vb_uuid, _, high_seqno = originalVbInfo[vbucket]
            stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, vb_uuid)
            responses = stream.run()
            assert high_seqno == stream.last_by_seqno
Example #14
 def test_setting_propogation_swap_rebalance(self):
     expected_curl = self.set_tmpspace()
     self.assertEqual(expected_curl['queryTmpSpaceSize'], self.tmp_size)
     expected_dir = self.set_directory()
     self.assertEqual(expected_dir['queryTmpSpaceDir'], self.directory_path)
     nodes_out_list = self.servers[1]
     to_add_nodes = [self.servers[self.nodes_init + 1]]
     to_remove_nodes = [nodes_out_list]
     services_in = ["index", "n1ql", "data"]
     # do a swap rebalance
     rebalance = self.cluster.async_rebalance(
         self.servers[:self.nodes_init],
         to_add_nodes, [],
         services=services_in)
     reached = RestHelper(self.rest).rebalance_reached()
     self.assertTrue(reached, "rebalance failed, stuck or did not complete")
     rebalance.result()
     rebalance = self.cluster.async_rebalance(
         self.servers[:self.nodes_init + 1], [], to_remove_nodes)
     reached = RestHelper(self.rest).rebalance_reached()
      self.assertTrue(reached, "rebalance failed, stuck or did not complete")
      rebalance.result()
     self.sleep(5)
     curl_url = "http://%s:%s/settings/querySettings" % (self.servers[
         self.nodes_init + 1].ip, self.servers[self.nodes_init + 1].port)
     curl_output = self.shell.execute_command(
         "%s -u Administrator:password %s" % (self.curl_path, curl_url))
     expected_curl = self.convert_list_to_json(curl_output[0])
     self.assertEqual(expected_curl['queryTmpSpaceSize'], self.tmp_size)
     self.assertEqual(expected_curl['queryTmpSpaceDir'],
                      self.directory_path)
Example #15
 def test_node_cb_restart(self):
     timeout = self.timeout / 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     shell = RemoteMachineShellConnection(self.server_fail)
     shell.restart_couchbase()
     AutoReprovisionBaseTest.wait_for_failover_or_assert(
         self.master, 1,
         timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
     AutoReprovisionBaseTest.wait_for_failover_or_assert(
         self.master, 0,
         timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
     self.sleep(5)
     helper = RestHelper(self.rest)
     self.assertTrue(helper.is_cluster_healthy(),
                     "cluster status is not healthy")
      self.assertFalse(helper.is_cluster_rebalanced(),
                       "cluster is balanced")
     self.rest.rebalance(
         otpNodes=[node.id for node in self.rest.node_statuses()],
         ejectedNodes=[])
     self.assertTrue(self.rest.monitorRebalance())
     buckets = self.rest.get_buckets()
     for bucket in buckets:
         self.verify_loaded_data(self.master, bucket.name,
                                 self.loaded_items[bucket.name])
Example #16
    def test_crash_entire_cluster(self):

        self.cluster.rebalance(
            [self.master],
            self.servers[1:], [])

        vbucket = 0
        nodeA = self.servers[0]
        n = 10000
        self.load_docs(nodeA, vbucket, n)

        dcp_client = self.dcp_client(nodeA, PRODUCER)
        stream = dcp_client.stream_req(vbucket, 0, 0, 2*n, 0)
        self.load_docs(nodeA, vbucket, n)

        # stop all nodes
        node_range = list(range(len(self.servers)))
        for i in node_range:
            assert self.stop_node(i)
        time.sleep(2)

        # start all nodes in reverse order
        node_range.reverse()
        for i in node_range:
            assert self.start_node(i)

        rest = RestHelper(RestConnection(nodeA))
        assert rest.is_ns_server_running()

        _, _, high_seqno = self.vb_info(nodeA, vbucket)
        dcp_client = self.dcp_client(nodeA, PRODUCER)
        stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, 0)
        stream.run()
        assert stream.last_by_seqno == high_seqno
Example #17
 def test_node_reboot(self):
     wait_timeout = 120
     timeout = self.timeout / 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     shell = RemoteMachineShellConnection(self.server_fail)
     if shell.extract_remote_info().type.lower() == 'windows':
         o, r = shell.execute_command("shutdown -r -f -t 0")
     elif shell.extract_remote_info().type.lower() == 'linux':
         o, r = shell.execute_command("reboot")
     shell.log_command_output(o, r)
     if shell.extract_remote_info().type.lower() == 'windows':
         time.sleep(wait_timeout * 5)
     else:
         time.sleep(wait_timeout)
     # disable firewall on the node
     shell = RemoteMachineShellConnection(self.server_fail)
     shell.disable_firewall()
     AutoReprovisionBaseTest.wait_for_failover_or_assert(
         self.master, 0,
         timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)
     helper = RestHelper(self.rest)
     self.assertTrue(helper.is_cluster_healthy(),
                     "cluster status is not healthy")
     self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
     self.rest.rebalance(
         otpNodes=[node.id for node in self.rest.node_statuses()],
         ejectedNodes=[])
     self.assertTrue(self.rest.monitorRebalance())
     buckets = self.rest.get_buckets()
     for bucket in buckets:
         self.verify_loaded_data(self.master, bucket.name,
                                 self.loaded_items[bucket.name])
Example #18
    def test_crash_entire_cluster(self):

        self.cluster.rebalance(
            [self.master],
            self.servers[1:], [])


        vbucket = 0
        nodeA = self.servers[0]
        n = 10000
        self.load_docs(nodeA, vbucket, n)

        dcp_client = self.dcp_client(nodeA, PRODUCER)
        stream = dcp_client.stream_req(vbucket, 0, 0, 2*n, 0)
        self.load_docs(nodeA, vbucket, n)

        # stop all nodes
        node_range = list(range(len(self.servers)))
        for i in node_range:
            assert self.stop_node(i)
        time.sleep(2)

        # start all nodes in reverse order
        node_range.reverse()
        for i in node_range:
            assert self.start_node(i)

        rest = RestHelper(RestConnection(nodeA))
        assert rest.is_ns_server_running()

        _, _, high_seqno = self.vb_info(nodeA, vbucket)
        dcp_client = self.dcp_client(nodeA, PRODUCER)
        stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, 0)
        stream.run()
        assert stream.last_by_seqno == high_seqno
Example #19
 def test_permissions(self):
     shell = RemoteMachineShellConnection(self.master)
     info = shell.extract_remote_info()
     if info.type.lower() == 'windows':
         self.log.info('Test is designed for linux only')
         return
     shell.execute_command('chmod 000 %s' % LINUX_CB_PATH)
     self.sleep(10, 'wait for couchbase stopping')
     shell.execute_command('chmod 755 %s' % LINUX_CB_PATH)
     self.sleep(10, 'wait for couchbase start')
     try:
         rest = RestConnection(self.master)
         self.assertTrue(
             RestHelper(rest).is_ns_server_running(timeout_in_seconds=60),
             'NS server is not up')
     except Exception as ex:
         self.log.error('Couchbase is not running')
         shell.execute_command('reboot')
         self.sleep(60, 'wait for reboot of VM')
         rest = RestConnection(self.master)
         self.assertTrue(
             RestHelper(rest).is_ns_server_running(timeout_in_seconds=60),
             'NS server is not up')
         raise ex
     finally:
         shell.disconnect()
Example #20
 def test_query_swap_rebalance(self):
     self.run_cbq_query(query="PREPARE p1 from select * from default limit 5", server=self.servers[0])
     self.sleep(5)
     for i in range(self.nodes_init):
         if not self.servers[i] == self.servers[1]:
             self.run_cbq_query(query="execute p1", server=self.servers[i])
     nodes_out_list = self.get_nodes_from_services_map(service_type="index", get_all_nodes=False)
     to_add_nodes = [self.servers[self.nodes_init + 2]]
     to_remove_nodes = [nodes_out_list]
     services_in = ["index", "n1ql", "data"]
     self.log.info(self.servers[:self.nodes_init])
     # do a swap rebalance
     rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init], to_add_nodes, [], services=services_in)
     reached = RestHelper(self.rest).rebalance_reached()
     self.assertTrue(reached, "rebalance failed, stuck or did not complete")
     rebalance.result()
     rebalance = self.cluster.async_rebalance(self.servers[:self.nodes_init + 2], [], to_remove_nodes)
     reached = RestHelper(self.rest).rebalance_reached()
     self.assertTrue(reached, "rebalance failed, stuck or did not complete")
     rebalance.result()
     self.sleep(30)
     for i in range(self.nodes_init):
         if not self.servers[i] == self.servers[1]:
             self.run_cbq_query(query="execute '[%s:%s]p1'" % (self.servers[2].ip, self.servers[2].port),
                                            server=self.servers[i])
Example #21
 def backup_restore(self):
     try:
         backup_start = self.backups[int(self.backupset.start) - 1]
     except IndexError:
         backup_start = "{0}{1}".format(self.backups[-1], self.backupset.start)
     try:
         backup_end = self.backups[int(self.backupset.end) - 1]
     except IndexError:
         backup_end = "{0}{1}".format(self.backups[-1], self.backupset.end)
     args = "restore --archive {0} --repo {1} --host http://{2}:{3} --username {4} --password {5} --start {6} " \
            "--end {7}".format(self.backupset.directory, self.backupset.name,
                                               self.backupset.restore_cluster_host.ip,
                                               self.backupset.restore_cluster_host.port,
                                               self.backupset.restore_cluster_host_username,
                                               self.backupset.restore_cluster_host_password, backup_start,
                                               backup_end)
     if self.backupset.exclude_buckets:
         args += " --exclude-buckets {0}".format(self.backupset.exclude_buckets)
     if self.backupset.include_buckets:
         args += " --include-buckets {0}".format(self.backupset.include_buckets)
     if self.backupset.disable_bucket_config:
         args += " --disable-bucket-config {0}".format(self.backupset.disable_bucket_config)
     if self.backupset.disable_views:
         args += " --disable-views {0}".format(self.backupset.disable_views)
     if self.backupset.disable_gsi_indexes:
         args += " --disable-gsi-indexes {0}".format(self.backupset.disable_gsi_indexes)
     if self.backupset.disable_ft_indexes:
         args += " --disable-ft-indexes {0}".format(self.backupset.disable_ft_indexes)
     if self.backupset.disable_data:
         args += " --disable-data {0}".format(self.backupset.disable_data)
      if self.backupset.filter_keys:
          args += " --filter-keys {0}".format(self.backupset.filter_keys)
      if self.backupset.filter_values:
          args += " --filter-values {0}".format(self.backupset.filter_values)
     if self.backupset.force_updates:
         args += " --force-updates"
     if self.no_progress_bar:
         args += " --no-progress-bar"
     if not self.skip_buckets:
         rest_conn = RestConnection(self.backupset.restore_cluster_host)
         rest_helper = RestHelper(rest_conn)
         for bucket in self.buckets:
             if not rest_helper.bucket_exists(bucket.name):
                 self.log.info("Creating bucket {0} in restore host {1}".format(bucket.name,
                                                                                self.backupset.restore_cluster_host.ip))
                 rest_conn.create_bucket(bucket=bucket.name,
                                         ramQuotaMB=512,
                                         authType=bucket.authType if bucket.authType else 'none',
                                         proxyPort=bucket.port,
                                         saslPassword=bucket.saslPassword)
                 bucket_ready = rest_helper.vbucket_map_ready(bucket.name)
                 if not bucket_ready:
                     self.fail("Bucket %s not created after 120 seconds." % bucket.name)
     remote_client = RemoteMachineShellConnection(self.backupset.backup_host)
     command = "{0}/cbbackupmgr {1}".format(self.cli_command_location, args)
     output, error = remote_client.execute_command(command)
     remote_client.log_command_output(output, error)
     return output, error
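
For illustration, the sketch below assembles the same restore command with placeholder values (archive path, repo name, host, credentials, and backup names are all hypothetical):

    args = ("restore --archive {0} --repo {1} --host http://{2}:{3} "
            "--username {4} --password {5} --start {6} --end {7}").format(
        "/data/backups", "my_repo", "10.1.1.2", 8091,
        "Administrator", "password", "backup_1", "backup_3")
    command = "{0}/cbbackupmgr {1}".format("/opt/couchbase/bin", args)
    print(command)
    # -> /opt/couchbase/bin/cbbackupmgr restore --archive /data/backups ...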
Example #22
 def run_failover_operations_with_ops(self, chosen, failover_reason):
     """ Method to run fail over operations used in the test scenario based on failover reason """
     # Perform Operations relalted to failover
     failed_over = True
     for node in chosen:
         unreachable = False
         if failover_reason == 'stop_server':
             unreachable = True
             self.stop_server(node)
             self.log.info("10 seconds delay to wait for membase-server to shutdown")
             # wait for 5 minutes until node is down
             self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                 msg="node status is not unhealthy even after waiting for 5 minutes")
         elif failover_reason == "firewall":
             unreachable = True
              self.filter_list.append(node.ip)
             server = [srv for srv in self.servers if node.ip == srv.ip][0]
             RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
             status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
             if status:
                 self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
             else:
                 # verify iptables on the node if something wrong
                 for server in self.servers:
                     if server.ip == node.ip:
                         shell = RemoteMachineShellConnection(server)
                         info = shell.extract_remote_info()
                         if info.type.lower() == "windows":
                             o, r = shell.execute_command("netsh advfirewall show allprofiles")
                             shell.log_command_output(o, r)
                         else:
                             o, r = shell.execute_command("/sbin/iptables --list")
                             shell.log_command_output(o, r)
                         shell.disconnect()
                 self.rest.print_UI_logs()
                 api = self.rest.baseUrl + 'nodeStatuses'
                 status, content, header = self.rest._http_request(api)
                 json_parsed = json.loads(content)
                 self.log.info("nodeStatuses: {0}".format(json_parsed))
                 self.fail("node status is not unhealthy even after waiting for 5 minutes")
     nodes = self.filter_servers(self.servers, chosen)
     failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful)
     # Perform Compaction
     compact_tasks = []
     if self.compact:
         for bucket in self.buckets:
             compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket))
     # Run View Operations
     if self.withViewsOps:
         self.query_and_monitor_view_tasks(nodes)
     # Run mutation operations
     if self.withMutationOps:
         self.run_mutation_operations()
     failed_over.result()
     for task in compact_tasks:
         task.result()
     msg = "rebalance failed while removing failover nodes {0}".format(node.id)
     self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
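
filter_servers is not shown in this example; a plausible sketch, assuming it returns the servers that were not chosen for failover, matched by IP:

    def filter_servers(all_servers, chosen):
        # Keep the servers whose IP is not among the failed-over nodes.
        chosen_ips = set(node.ip for node in chosen)
        return [srv for srv in all_servers if srv.ip not in chosen_ips]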
Example #23
    def cleanup_cluster(servers, wait_for_rebalance=True, master=None):
        log = logger.Logger.get_logger()
        if master is None:
            master = servers[0]
        rest = RestConnection(master)
        helper = RestHelper(rest)
        helper.is_ns_server_running(timeout_in_seconds=testconstants.NS_SERVER_TIMEOUT)
        nodes = rest.node_statuses()
        master_id = rest.get_nodes_self().id
        for node in list(nodes):
            if int(node.port) in range(9091, 9991):
                rest.eject_node(node)
                nodes.remove(node)

        if len(nodes) > 1:
            log.info("rebalancing all nodes in order to remove nodes")
            rest.log_client_error("Starting rebalance from test, ejected nodes %s" % \
                                                             [node.id for node in nodes if node.id != master_id])
            removed = helper.remove_nodes(knownNodes=[node.id for node in nodes],
                                          ejectedNodes=[node.id for node in nodes if node.id != master_id],
                                          wait_for_rebalance=wait_for_rebalance)
            success_cleaned = []
            for removed in [node for node in nodes if (node.id != master_id)]:
                removed.rest_password = servers[0].rest_password
                removed.rest_username = servers[0].rest_username
                try:
                    rest = RestConnection(removed)
                except Exception as ex:
                    log.error("can't create rest connection after rebalance out for ejected nodes,\
                        will retry after 10 seconds according to MB-8430: {0} ".format(ex))
                    time.sleep(10)
                    rest = RestConnection(removed)
                start = time.time()
                while time.time() - start < 30:
                    if len(rest.get_pools_info()["pools"]) == 0:
                        success_cleaned.append(removed)
                        break
                    else:
                        time.sleep(0.1)
                if time.time() - start > 10:
                    log.error("'pools' on node {0}:{1} - {2}".format(
                           removed.ip, removed.port, rest.get_pools_info()["pools"]))
            for node in set([node for node in nodes if (node.id != master_id)]) - set(success_cleaned):
                log.error("node {0}:{1} was not cleaned after removing from cluster".format(
                           node.ip, node.port))
                try:
                    rest = RestConnection(node)
                    rest.force_eject_node()
                except Exception as ex:
                    log.error("force_eject_node {0}:{1} failed: {2}".format(node.ip, node.port, ex))
            if len(set([node for node in nodes if (node.id != master_id)])\
                    - set(success_cleaned)) != 0:
                raise Exception("not all ejected nodes were cleaned successfully")

            log.info("removed all the nodes from cluster associated with {0} ? {1}".format(servers[0], \
                    [(node.id, node.port) for node in nodes if (node.id != master_id)]))
Example #24
    def cleanup_cluster(servers, wait_for_rebalance=True, master=None):
        log = logger.Logger.get_logger()
        if master is None:
            master = servers[0]
        rest = RestConnection(master)
        helper = RestHelper(rest)
        helper.is_ns_server_running(timeout_in_seconds=testconstants.NS_SERVER_TIMEOUT)
        nodes = rest.node_statuses()
        master_id = rest.get_nodes_self().id
        for node in list(nodes):
            if int(node.port) in range(9091, 9991):
                rest.eject_node(node)
                nodes.remove(node)

        if len(nodes) > 1:
            log.info("rebalancing all nodes in order to remove nodes")
            rest.log_client_error("Starting rebalance from test, ejected nodes %s" % \
                                                             [node.id for node in nodes if node.id != master_id])
            removed = helper.remove_nodes(knownNodes=[node.id for node in nodes],
                                          ejectedNodes=[node.id for node in nodes if node.id != master_id],
                                          wait_for_rebalance=wait_for_rebalance)
            success_cleaned = []
            for removed in [node for node in nodes if (node.id != master_id)]:
                removed.rest_password = servers[0].rest_password
                removed.rest_username = servers[0].rest_username
                try:
                    rest = RestConnection(removed)
                except Exception as ex:
                    log.error("can't create rest connection after rebalance out for ejected nodes,\
                        will retry after 10 seconds according to MB-8430: {0} ".format(ex))
                    time.sleep(10)
                    rest = RestConnection(removed)
                start = time.time()
                while time.time() - start < 30:
                    if len(rest.get_pools_info()["pools"]) == 0:
                        success_cleaned.append(removed)
                        break
                    else:
                        time.sleep(0.1)
                if time.time() - start > 10:
                    log.error("'pools' on node {0}:{1} - {2}".format(
                           removed.ip, removed.port, rest.get_pools_info()["pools"]))
            for node in set([node for node in nodes if (node.id != master_id)]) - set(success_cleaned):
                log.error("node {0}:{1} was not cleaned after removing from cluster".format(
                           node.ip, node.port))
                try:
                    rest = RestConnection(node)
                    rest.force_eject_node()
                except Exception as ex:
                    log.error("force_eject_node {0}:{1} failed: {2}".format(node.ip, node.port, ex))
            if len(set([node for node in nodes if (node.id != master_id)])\
                    - set(success_cleaned)) != 0:
                raise Exception("not all ejected nodes were cleaned successfully")

            log.info("removed all the nodes from cluster associated with {0} ? {1}".format(servers[0], \
                    [(node.id, node.port) for node in nodes if (node.id != master_id)]))
Example #25
 def wait_for_bucket_creation(bucket, rest, timeout_in_seconds=120):
     log.info('waiting for bucket creation to complete....')
     start = time.time()
     helper = RestHelper(rest)
     while (time.time() - start) <= timeout_in_seconds:
         if helper.bucket_exists(bucket):
             return True
         else:
             time.sleep(2)
     return False
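
A usage sketch, assuming testrunner's RestConnection and a server object for the cluster master (master here is illustrative, supplied by the test framework):

    from membase.api.rest_client import RestConnection

    rest = RestConnection(master)  # master: the TestInputServer for the cluster
    if not wait_for_bucket_creation('default', rest, timeout_in_seconds=60):
        raise AssertionError("bucket 'default' was not created within 60s")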
Example #26
 def wait_for_bucket_creation(bucket, rest, timeout_in_seconds=120):
     log.info("waiting for bucket creation to complete....")
     start = time.time()
     helper = RestHelper(rest)
     while (time.time() - start) <= timeout_in_seconds:
         if helper.bucket_exists(bucket):
             return True
         else:
             time.sleep(2)
     return False
Example #27
 def wait_for_bucket_deletion(bucket, rest, timeout_in_seconds=120):
     log = logger.Logger.get_logger()
     log.info('waiting for bucket deletion to complete....')
     start = time.time()
     helper = RestHelper(rest)
     while (time.time() - start) <= timeout_in_seconds:
         if not helper.bucket_exists(bucket):
             return True
         else:
             time.sleep(0.1)
     return False
Example #28
 def _create_default_bucket(self):
     helper = RestHelper(self.rest)
     if not helper.bucket_exists(self.bucket):
         node_ram_ratio = BucketOperationHelper.base_bucket_ratio(self.servers)
         info = self.rest.get_nodes_self()
         available_ram = int(info.memoryQuota * node_ram_ratio)
         if available_ram < 256:
             available_ram = 256
         self.rest.create_bucket(bucket=self.bucket, ramQuotaMB=available_ram)
         ready = BucketOperationHelper.wait_for_memcached(self.master, self.bucket)
         self.testcase.assertTrue(ready, "wait_for_memcached failed")
     self.testcase.assertTrue(helper.bucket_exists(self.bucket), "unable to create {0} bucket".format(self.bucket))
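
The sizing logic above gives the bucket a fixed share of the node's memory quota, clamped to a 256 MB floor. Restated as standalone arithmetic (an illustration, not testrunner code):

    def default_bucket_quota_mb(memory_quota_mb, node_ram_ratio, floor_mb=256):
        # e.g. a 1024 MB node quota at a 0.5 ratio yields 512 MB;
        # anything smaller is bumped up to the 256 MB floor.
        return max(int(memory_quota_mb * node_ram_ratio), floor_mb)

    assert default_bucket_quota_mb(1024, 0.5) == 512
    assert default_bucket_quota_mb(100, 0.5) == 256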
Example #29
    def test_reset_count(self):
        timeout = self.timeout / 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autoreprovision_settings(True, 2)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')

        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(
            self.master, 2,
            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME, self)

        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 0)

        self._start_couchbase(server_fail2)
        self._start_couchbase(server_fail1)
        self.sleep(30)
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 2)
        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 0)

        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(),
                        "cluster status is not healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
        self.rest.rebalance(
            otpNodes=[node.id for node in self.rest.node_statuses()],
            ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
Example #30
 def _create_default_bucket(self, replica=1):
     name = "default"
     master = self.servers[0]
     rest = RestConnection(master)
     helper = RestHelper(RestConnection(master))
     if not helper.bucket_exists(name):
         node_ram_ratio = BucketOperationHelper.base_bucket_ratio(self.servers)
         info = rest.get_nodes_self()
         available_ram = info.memoryQuota * node_ram_ratio
         rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram), replicaNumber=replica)
         ready = BucketOperationHelper.wait_for_memcached(master, name)
         self.assertTrue(ready, msg="wait_for_memcached failed")
     self.assertTrue(helper.bucket_exists(name), msg="unable to create {0} bucket".format(name))
Example #31
 def cleanup_cluster(servers, wait_for_rebalance=True):
     log = logger.Logger.get_logger()
     rest = RestConnection(servers[0])
     helper = RestHelper(rest)
     helper.is_ns_server_running(timeout_in_seconds=testconstants.NS_SERVER_TIMEOUT)
     nodes = rest.node_statuses()
     master_id = rest.get_nodes_self().id
     if len(nodes) > 1:
         log.info("rebalancing all nodes in order to remove nodes")
         removed = helper.remove_nodes(knownNodes=[node.id for node in nodes],
                                       ejectedNodes=[node.id for node in nodes if node.id != master_id],
                                       wait_for_rebalance=wait_for_rebalance)
         log.info("removed all the nodes from cluster associated with {0} ? {1}".format(servers[0], removed))
Example #32
 def wait_for_bucket_deletion(bucket,
                              rest,
                              timeout_in_seconds=120):
     log = logger.Logger.get_logger()
     log.info('waiting for bucket deletion to complete....')
     start = time.time()
     helper = RestHelper(rest)
     while (time.time() - start) <= timeout_in_seconds:
         if not helper.bucket_exists(bucket):
             return True
         else:
             time.sleep(2)
     return False
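
The bucket creation and deletion waiters in Examples 25-27 and 32 are the same loop with the predicate flipped. A generic sketch (a hypothetical helper, not in testrunner):

    import time

    def wait_until(predicate, timeout_in_seconds=120, interval=2):
        # Poll predicate() until it returns True or the timeout elapses.
        start = time.time()
        while (time.time() - start) <= timeout_in_seconds:
            if predicate():
                return True
            time.sleep(interval)
        return False

    # e.g. wait_until(lambda: helper.bucket_exists(bucket))       # creation
    #      wait_until(lambda: not helper.bucket_exists(bucket))   # deletion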
Example #33
 def test_failed_rebalance_with_gsi_autofailover(self):
     self.bucket_params = self._create_bucket_params(
         server=self.master,
         size=self.bucket_size,
         replicas=self.num_replicas,
         bucket_type=self.bucket_type,
         enable_replica_index=self.enable_replica_index,
         eviction_policy=self.eviction_policy,
         lww=self.lww)
     self.cluster.create_standard_bucket(name=self.test_bucket,
                                         port=11222,
                                         bucket_params=self.bucket_params)
     self.buckets = self.rest.get_buckets()
     self.prepare_collection_for_indexing(num_of_docs_per_collection=10**5)
     self._create_indexes()
     # enable auto failover
     self.enable_autofailover_and_validate()
     # Start rebalance in
     rebalance_task = self.cluster.async_rebalance(
         servers=self.servers,
         to_add=self.servers_to_add,
         to_remove=self.servers_to_remove,
         services=['kv', 'index'])
     self.sleep(20)
     reached = RestHelper(self.rest).rebalance_reached(percentage=20)
     self.assertTrue(reached,
                     "Rebalance failed or did not reach {0}%".format(20))
      # Do a failover action - reboot, hang, kill. This is defined in the conf file. The test sometimes fails
      # because the rebalance completes too quickly, leaving no window to induce a failure.
     self.failover_actions[self.failover_action](self)
     try:
         rebalance_task.result()
     except Exception as err:
         self.log.info("Rebalance failed with : {0}".format(str(err)))
         if "Rebalance failed. See logs for detailed reason. You can try again" in str(
                 err):
             self.log.info(
                 "Rebalance failed even before auto-failover had a chance to stop it self.server_to_fail.ip: {0}"
                 .format(str(err)))
         elif not RestHelper(self.rest).is_cluster_rebalanced():
             if self._auto_failover_message_present_in_logs(
                     self.server_to_fail[0].ip):
                 self.log.info(
                     "Rebalance interrupted due to auto-failover of nodes - message was seen in logs"
                 )
             else:
                 self.fail(
                     "Rebalance interrupted message was not seen in logs")
         else:
             self.fail("Rebalance was not aborted by auto fail-over")
     self.disable_autofailover_and_validate()
Example #34
 def _create_default_bucket(self):
     name = "default"
     master = self.master
     rest = RestConnection(master)
     helper = RestHelper(RestConnection(master))
     node_ram_ratio = BucketOperationHelper.base_bucket_ratio(TestInputSingleton.input.servers)
     info = rest.get_nodes_self()
     available_ram = info.memoryQuota * node_ram_ratio
     rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram))
     ready = BucketOperationHelper.wait_for_memcached(master, name)
     self.assertTrue(ready, msg="wait_for_memcached failed")
     self.assertTrue(helper.bucket_exists(name), msg="unable to create {0} bucket".format(name))
     self.load_thread = None
     self.shutdown_load_data = False
Example #35
 def _create_default_bucket(self):
     name = "default"
     master = self.master
     rest = RestConnection(master)
     helper = RestHelper(RestConnection(master))
     if not helper.bucket_exists(name):
         node_ram_ratio = BucketOperationHelper.base_bucket_ratio(TestInputSingleton.input.servers)
         info = rest.get_nodes_self()
         available_ram = info.memoryQuota * node_ram_ratio
         rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram))
         ready = BucketOperationHelper.wait_for_memcached(master, name)
         self.assertTrue(ready, msg="wait_for_memcached failed")
     self.assertTrue(helper.bucket_exists(name),
                     msg="unable to create {0} bucket".format(name))
Example #36
 def setUp(self):
     super(NewUpgradeBaseTest, self).setUp()
     self.product = self.input.param('product', 'couchbase-server')
     self.initial_version = self.input.param('initial_version',
                                             '1.8.1-942-rel')
     self.initial_vbuckets = self.input.param('initial_vbuckets', 64)
     self.rest_settings = self.input.membase_settings
     self.rest = RestConnection(self.master)
     self.rest_helper = RestHelper(self.rest)
     self.sleep_time = 10
     self.data_size = self.input.param('data_size', 1024)
     self.op_types = self.input.param('op_types', 'bucket')
     self.item_flag = self.input.param('item_flag', 4042322160)
     self.expire_time = self.input.param('expire_time', 0)
Example #37
    def test_reset_count(self):
        timeout = self.timeout / 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autoreprovision_settings(True, 2)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)

        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')

        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 2,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)

        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 0)

        self._start_couchbase(server_fail2)
        self._start_couchbase(server_fail1)
        self.sleep(30)
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 2)
        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')
        settings = self.rest.get_autoreprovision_settings()
        self.assertEquals(settings.enabled, True)
        self.assertEquals(settings.max_nodes, 2)
        self.assertEquals(settings.count, 0)

        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
Ejemplo n.º 38
 def _create_default_bucket(self, unittest):
     name = "default"
     master = self.master
     rest = RestConnection(master)
     helper = RestHelper(RestConnection(master))
     if not helper.bucket_exists(name):
         node_ram_ratio = BucketOperationHelper.base_bucket_ratio(TestInputSingleton.input.servers)
         info = rest.get_nodes_self()
         available_ram = info.memoryQuota * node_ram_ratio
         rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram))
         ready = BucketOperationHelper.wait_for_memcached(master, name)
         BucketOperationHelper.wait_for_vbuckets_ready_state(master, name)
         unittest.assertTrue(ready, msg="wait_for_memcached failed")
     unittest.assertTrue(helper.bucket_exists(name),
                         msg="unable to create {0} bucket".format(name))
Ejemplo n.º 39
 def _create_default_bucket(self, replica=1):
     name = "default"
     master = self.servers[0]
     rest = RestConnection(master)
     helper = RestHelper(RestConnection(master))
     if not helper.bucket_exists(name):
         node_ram_ratio = BucketOperationHelper.base_bucket_ratio(self.servers)
         info = rest.get_nodes_self()
         available_ram = info.memoryQuota * node_ram_ratio
         rest.create_bucket(bucket=name, ramQuotaMB=int(available_ram), replicaNumber=replica,
                            storageBackend=self.bucket_storage)
         ready = BucketOperationHelper.wait_for_memcached(master, name)
         self.assertTrue(ready, msg="wait_for_memcached failed")
     self.assertTrue(helper.bucket_exists(name),
         msg="unable to create {0} bucket".format(name))
Ejemplo n.º 40
    def wait_for_ns_servers_or_assert(servers,
                                      testcase,
                                      wait_time=360,
                                      wait_if_warmup=False,
                                      debug=True):
        for server in servers:
            rest = RestConnection(server)
            log = logger.Logger.get_logger()
            if debug:
                log.info("waiting for ns_server @ {0}:{1}".format(
                    server.ip, server.port))
            if RestHelper(rest).is_ns_server_running(wait_time):
                if debug:
                    log.info("ns_server @ {0}:{1} is running".format(
                        server.ip, server.port))

            elif wait_if_warmup:
                # wait when warmup completed
                buckets = rest.get_buckets()
                for bucket in buckets:
                    testcase.assertTrue(
                        ClusterOperationHelper._wait_warmup_completed(
                            testcase, [server], bucket.name, wait_time),
                        "warmup was not completed!")

            else:
                testcase.fail("ns_server {0} is not running in {1} sec".format(
                    server.ip, wait_time))
Ejemplo n.º 41
    def replication_verification(master, bucket, replica, inserted_count,
                                 test):
        rest = RestConnection(master)
        nodes = rest.node_statuses()

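        # verify only when there are enough nodes to host one active copy plus
        # `replica` replica copies of each vbucket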
        if len(nodes) / (1 + replica) >= 1:
            final_replication_state = RestHelper(rest).wait_for_replication(
                900)
            msg = "replication state after waiting for up to 15 minutes : {0}"
            test.log.info(msg.format(final_replication_state))
            # in windows, we need to set timeout_in_seconds to 15+ minutes
            test.assertTrue(
                RebalanceHelper.wait_till_total_numbers_match(
                    master=master, bucket=bucket, timeout_in_seconds=1200),
                msg="replication was completed but sum(curr_items) doesn't "
                    "match the curr_items_total")

            start_time = time.time()
            stats = rest.get_bucket_stats()
            while time.time() < (start_time + 120) \
                    and stats["curr_items"] != inserted_count:
                test.log.info("curr_items : {0} versus {1}".format(
                    stats["curr_items"], inserted_count))
                time.sleep(5)
                stats = rest.get_bucket_stats()
            RebalanceHelper.print_taps_from_all_nodes(rest, bucket)
            test.log.info("curr_items : {0} versus {1}".format(
                stats["curr_items"], inserted_count))
            stats = rest.get_bucket_stats()
            msg = "curr_items : {0} is not equal to actual # of keys inserted : {1}"
            test.assertEquals(stats["curr_items"],
                              inserted_count,
                              msg=msg.format(stats["curr_items"],
                                             inserted_count))
Ejemplo n.º 42
 def add_node_and_rebalance(self, master, servers):
     ClusterOperationHelper.add_all_nodes_or_assert(
         master, servers, self.input.membase_settings, self)
     rest = RestConnection(master)
     nodes = rest.node_statuses()
     otpNodeIds = []
     for node in nodes:
         otpNodeIds.append(node.id)
     rebalanceStarted = rest.rebalance(otpNodeIds, [])
     self.assertTrue(
         rebalanceStarted,
         "unable to start rebalance on master node {0}".format(master.ip))
     self.log.info('started rebalance operation on master node {0}'.format(
         master.ip))
     rebalanceSucceeded = rest.monitorRebalance()
     self.assertTrue(
         rebalanceSucceeded,
         "rebalance operation for nodes: {0} was not successful".format(
             otpNodeIds))
     self.log.info(
         'rebalance operaton succeeded for nodes: {0}'.format(otpNodeIds))
     # make sure the cluster is rebalanced and node statuses are healthy
     helper = RestHelper(rest)
     self.assertTrue(helper.is_cluster_healthy(),
                     "cluster status is not healthy")
     self.assertTrue(helper.is_cluster_rebalanced(),
                     "cluster is not balanced")
Ejemplo n.º 43
 def _create_default_bucket(self):
     rest = RestConnection(self.master)
     helper = RestHelper(RestConnection(self.master))
     if not helper.bucket_exists(self.bucket):
         node_ram_ratio = BucketOperationHelper.base_bucket_ratio([self.master])
         info = rest.get_nodes_self()
         available_ram = info.memoryQuota * node_ram_ratio
         serverInfo = self.master
         rest.init_cluster(username=serverInfo.rest_username,
                           password=serverInfo.rest_password)
         rest.init_cluster_memoryQuota(memoryQuota=int(info.mcdMemoryReserved * node_ram_ratio))
         rest.create_bucket(bucket=self.bucket, ramQuotaMB=int(available_ram))
         ready = BucketOperationHelper.wait_for_memcached(self.master, self.bucket)
         self.assertTrue(ready, msg="wait_for_memcached failed")
     self.assertTrue(helper.bucket_exists(self.bucket),
                     msg="unable to create {0} bucket".format(self.bucket))
Ejemplo n.º 44
 def wait_for_rebalance_to_complete(self, task, wait_step=120):
     self.task.jython_task_manager.get_task_result(task)
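     # poll rebalance progress over REST until the rebalance completes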
     reached = RestHelper(self.rest).rebalance_reached(wait_step=wait_step)
     self.assertTrue(reached, "Rebalance failed, stuck or did not complete")
     self.assertTrue(task.result, "Rebalance Failed")
     if self.compaction:
         self.wait_for_compaction_to_complete()
Ejemplo n.º 45
 def test_node_memcached_failure(self):
     timeout = self.timeout / 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     self._pause_couchbase(self.server_fail)
     self.sleep(5)
     AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1,
                                                       timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                       self)
     RemoteUtilHelper.common_basic_setup([self.server_fail])
     AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                         timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                         self)
     helper = RestHelper(self.rest)
     self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
     self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
     buckets = self.rest.get_buckets()
     for bucket in buckets:
         self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
Ejemplo n.º 46
 def setUp(self):
     super(NewUpgradeBaseTest, self).setUp()
     self.product = self.input.param('product', 'couchbase-server')
     self.initial_version = self.input.param('initial_version', '1.8.1-942-rel')
     self.initial_vbuckets = self.input.param('initial_vbuckets', 64)
     self.rest_settings = self.input.membase_settings
     self.rest = RestConnection(self.master)
     self.rest_helper = RestHelper(self.rest)
     self.sleep_time = 10
     self.data_size = self.input.param('data_size', 1024)
     self.op_types = self.input.param('op_types', 'bucket')
     self.item_flag = self.input.param('item_flag', 4042322160)
     self.expire_time = self.input.param('expire_time', 0)
Ejemplo n.º 47
    def test_crash_while_streaming(self):

        vbucket = 0
        nodeA = self.servers[0]
        n = 10000
        self.load_docs(nodeA, vbucket, n)

        dcp_client = self.dcp_client(nodeA, PRODUCER)
        stream = dcp_client.stream_req(vbucket, 0, 0, 2*n, 0)
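        # stream request args (as used here): vbucket, flags/takeover,
        # start seqno, end seqno, vb uuid; end=2*n leaves headroom for the
        # second batch loaded below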
        self.load_docs(nodeA, vbucket, n)
        assert self.stop_node(0)
        time.sleep(2)
        assert self.start_node(0)
        rest = RestHelper(RestConnection(nodeA))
        assert rest.is_ns_server_running()
        time.sleep(2)

        _, _, high_seqno = self.vb_info(nodeA, vbucket)
        dcp_client = self.dcp_client(nodeA, PRODUCER)
        stream = dcp_client.stream_req(vbucket, 0, 0, high_seqno, 0)
        stream.run()
        assert stream.last_by_seqno == high_seqno
Ejemplo n.º 48
 def terminate(self):
     if self._xdcr:
         self._terminate_replications(self._s_master, "cluster1")
         if self._rdirection == "bidirection":
             self._terminate_replications(self._d_master, "cluster0")
     for key in self._clusters_keys_olst:
         nodes = self._clusters_dic[key]
         for node in nodes:
             rest = RestConnection(node)
             buckets = rest.get_buckets()
             for bucket in buckets:
                 status = rest.delete_bucket(bucket.name)
                 if status:
                     self._log.info('Deleted bucket : {0} from {1}'.format(bucket.name, node.ip))
         rest = RestConnection(nodes[0])
         helper = RestHelper(rest)
         servers = rest.node_statuses()
         master_id = rest.get_nodes_self().id
         if len(nodes) > 1:
              removed = helper.remove_nodes(knownNodes=[node.id for node in servers],
                                            ejectedNodes=[node.id for node in servers if node.id != master_id],
                                            wait_for_rebalance=True)
Ejemplo n.º 49
 def test_node_cb_restart(self):
     timeout = self.timeout / 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     shell = RemoteMachineShellConnection(self.server_fail)
     shell.restart_couchbase()
     AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                         timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                         self)
     AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                         timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                         self)
     self.sleep(5)
     helper = RestHelper(self.rest)
     self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
     self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
     self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
     self.assertTrue(self.rest.monitorRebalance())
     buckets = self.rest.get_buckets()
     for bucket in buckets:
         self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
Ejemplo n.º 50
    def test_stream_after_n_crashes(self):

        crashes = 5
        vbucket = 0

        # load some data
        nodeA = self.servers[0]
        rest = RestHelper(RestConnection(nodeA))
        for i in xrange(crashes):
            self.load_docs(nodeA, vbucket, self.num_items)
            assert self.stop_node(0)
            time.sleep(5)
            assert self.start_node(0)
            assert rest.is_ns_server_running()
            time.sleep(2)

            vb_uuid, _, high_seqno = self.vb_info(nodeA, vbucket)
            dcp_client = self.dcp_client(nodeA, PRODUCER)
            stream = dcp_client.stream_req(
                vbucket, 0, 0,
                high_seqno, vb_uuid)
            stream.run()

            assert stream.last_by_seqno == high_seqno
Ejemplo n.º 51
 def verify_upgrade_rebalance_in_out(self):
     self.master = self.servers[self.initial_num_servers]
     self.rest = RestConnection(self.master)
     self.rest_helper = RestHelper(self.rest)
     for bucket in self.buckets:
         if self.rest_helper.bucket_exists(bucket.name):
             continue
         else:
             raise Exception("bucket:- %s not found" % bucket.name)
     if self.op_types == "bucket":
         bucketinfo = self.rest.get_bucket(bucket.name)
         self.log.info("bucket info :- %s" % bucketinfo)
     if self.op_types == "data":
         self._wait_for_stats_all_buckets(self.servers[self.initial_num_servers : self.num_servers])
         self._verify_all_buckets(self.master, 1, self.wait_timeout * 50, self.max_verify, True, 1)
         self._verify_stats_all_buckets(self.servers[self.initial_num_servers : self.num_servers])
Ejemplo n.º 52
 def test_node_memcached_failure_in_series(self):
     timeout = self.timeout / 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     data_lost = False
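     # walk the nodes from last to first so the master (index 0) is handled last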
     for i in reversed(xrange(len(self.servers))):
         print self.servers[i]
         operation = random.choice(['stop', 'memcached_failure', 'restart', 'failover', 'reboot'])
         shell = RemoteMachineShellConnection(self.servers[i])
         print "operation", operation
         if i == 0:
             self.master = self.servers[1]
         if operation == 'stop':
             self._stop_couchbase(self.servers[i])
         elif operation == 'memcached_failure':
             self._pause_couchbase(self.servers[i])
         elif operation == 'restart':
             shell.restart_couchbase()
         elif operation == 'failover':
             RemoteUtilHelper.enable_firewall(self.servers[i])
         elif operation == 'reboot':
             if shell.extract_remote_info().type.lower() == 'windows':
                 o, r = shell.execute_command("shutdown -r -f -t 0")
                 self.sleep(200)
             elif shell.extract_remote_info().type.lower() == 'linux':
                 o, r = shell.execute_command("reboot")
             shell.log_command_output(o, r)
             self.sleep(60)
         self.sleep(40)
         if operation == 'memcached_failure':
             AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1,
                                                               timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                               self)
         if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot':
             AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                                 timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                                 self)
         if operation != 'restart':
             RemoteUtilHelper.common_basic_setup([self.servers[i]])
         AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                             timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                             self)
         helper = RestHelper(RestConnection(self.master))
         self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
         self.sleep(40)
         if operation == 'memcached_failure' or operation == 'failover':
             self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
         else:
             if 'kv' in self.servers[i].services and self.replicas > 0:
                 self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
                 self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
                 self.assertTrue(self.rest.monitorRebalance())
             else:
                 self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
         buckets = self.rest.get_buckets()
         if self.replicas == 0 and (operation == 'restart' or operation == 'reboot'):
             data_lost = True
         for bucket in buckets:
             if not data_lost:
                 self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
Ejemplo n.º 53
    def test_ui_logs(self):
        timeout = self.timeout / 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autoreprovision_settings(True, 2)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        logs = self.rest.get_logs(5)
        self.assertTrue(u'Enabled auto-reprovision config with max_nodes set to 2' in [l['text'] for l in logs])

        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)

        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')
        logs = self.rest.get_logs(5)
        self.assertTrue(u'auto-reprovision count reset from 0' in [l['text'] for l in logs])

        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 2,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        settings = self.rest.get_autoreprovision_settings()
        self.assertEquals(settings.enabled, True)
        self.assertEquals(settings.max_nodes, 2)
        self.assertEquals(settings.count, 0)
        self._start_couchbase(server_fail2)
        self._start_couchbase(server_fail1)
        self.sleep(30)
        settings = self.rest.get_autoreprovision_settings()
        self.assertEquals(settings.enabled, True)
        self.assertEquals(settings.max_nodes, 2)
        self.assertEquals(settings.count, 2)
        logs = self.rest.get_logs(5)
        self.assertTrue(u'auto-reprovision is disabled as maximum number of nodes (2) '
                        u'that can be auto-reprovisioned has been reached.' in [l['text'] for l in logs])

        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')
        settings = self.rest.get_autoreprovision_settings()
        self.assertEquals(settings.enabled, True)
        self.assertEquals(settings.max_nodes, 2)
        self.assertEquals(settings.count, 0)
        logs = self.rest.get_logs(5)
        self.assertTrue(u'auto-reprovision count reset from 2' in [l['text'] for l in logs])

        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
        logs = self.rest.get_logs(5)
        # https://issues.couchbase.com/browse/MB-24520
        self.assertFalse(u'Reset auto-failover count' in [l['text'] for l in logs])
        self.assertTrue(u'Rebalance completed successfully.' in [l['text'] for l in logs])
Ejemplo n.º 54
class MultiNodesUpgradeTests(NewUpgradeBaseTest):
    def setUp(self):
        super(MultiNodesUpgradeTests, self).setUp()
        if self.initial_version.startswith("1.6") or self.initial_version.startswith("1.7"):
            self.product = "membase-server"
        else:
            self.product = "couchbase-server"
        self.initial_num_servers = self.input.param("initial_num_servers", 2)

    def tearDown(self):
        super(MultiNodesUpgradeTests, self).tearDown()

    def offline_cluster_upgrade(self):
        self._install(self.servers[: self.initial_num_servers])
        self.operations(multi_nodes=True)
        upgrade_versions = self.input.param("upgrade_version", "2.0.0-1870-rel")
        upgrade_versions = upgrade_versions.split(";")
        self.log.info("Installation done going to sleep for %s sec", self.sleep_time)
        time.sleep(self.sleep_time)
        for upgrade_version in upgrade_versions:
            for server in self.servers[: self.initial_num_servers]:
                remote = RemoteMachineShellConnection(server)
                remote.stop_server()
                time.sleep(self.sleep_time)
                remote.disconnect()
            for server in self.servers[: self.initial_num_servers]:
                remote = RemoteMachineShellConnection(server)
                self._upgrade(upgrade_version, server, remote)
                time.sleep(self.sleep_time)
                remote.disconnect()
            time.sleep(self.expire_time)
            self.num_servers = self.initial_num_servers
            self.verification(multi_nodes=True)

    def online_upgrade_rebalance_in_out(self):
        self._install(self.servers[: self.initial_num_servers])
        self.operations(multi_nodes=True)
        self.log.info("Installation of old version is done. Wait for %s sec for upgrade" % (self.sleep_time))
        time.sleep(self.sleep_time)
        upgrade_version = self.input.param("upgrade_version", "2.0.0-1870-rel")
        self.initial_version = upgrade_version
        self.product = "couchbase-server"
        self._install(self.servers[self.initial_num_servers : self.num_servers])
        self.log.info("Installation of new version is done. Wait for %s sec for rebalance" % (self.sleep_time))
        time.sleep(self.sleep_time)

        servers_in = self.servers[self.initial_num_servers : self.num_servers]
        self.cluster.rebalance(self.servers[: self.initial_num_servers], servers_in, [])
        self.log.info("Rebalance in all 2.0 Nodes")
        time.sleep(self.sleep_time)
        status, content = ClusterHelper.find_orchestrator(self.master)
        self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
        FIND_MASTER = False
        for new_server in servers_in:
            if content.find(new_server.ip) >= 0:
                FIND_MASTER = True
                self.log.info("2.0 Node %s becomes the master" % (new_server.ip))
        if not FIND_MASTER:
            raise Exception("After rebalance in 2.0 Nodes, 2.0 doesn't become the master")

        servers_out = self.servers[: self.initial_num_servers]
        self.cluster.rebalance(self.servers[: self.num_servers], [], servers_out)
        self.log.info("Rebalance out all old version nodes")
        time.sleep(self.sleep_time)
        self.verify_upgrade_rebalance_in_out()

    def verify_upgrade_rebalance_in_out(self):
        self.master = self.servers[self.initial_num_servers]
        self.rest = RestConnection(self.master)
        self.rest_helper = RestHelper(self.rest)
        for bucket in self.buckets:
            if self.rest_helper.bucket_exists(bucket.name):
                continue
            else:
                raise Exception("bucket:- %s not found" % bucket.name)
        if self.op_types == "bucket":
            bucketinfo = self.rest.get_bucket(bucket.name)
            self.log.info("bucket info :- %s" % bucketinfo)
        if self.op_types == "data":
            self._wait_for_stats_all_buckets(self.servers[self.initial_num_servers : self.num_servers])
            self._verify_all_buckets(self.master, 1, self.wait_timeout * 50, self.max_verify, True, 1)
            self._verify_stats_all_buckets(self.servers[self.initial_num_servers : self.num_servers])

    def online_upgrade_swap_rebalance(self):
        self._install(self.servers[: self.initial_num_servers])
        self.operations(multi_nodes=True)
        self.log.info("Installation of old version is done. Wait for %s sec for upgrade" % (self.sleep_time))
        time.sleep(self.sleep_time)
        upgrade_version = self.input.param("upgrade_version", "2.0.0-1870-rel")
        self.initial_version = upgrade_version
        self.product = "couchbase-server"
        self._install(self.servers[self.initial_num_servers : self.num_servers])
        self.log.info("Installation of new version is done. Wait for %s sec for rebalance" % (self.sleep_time))
        time.sleep(self.sleep_time)

        self.swap_num_servers = self.input.param("swap_num_servers", 1)
        old_servers = self.servers[: self.initial_num_servers]
        new_servers = []
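        # each pass rebalances swap_num_servers new-version nodes in while
        # taking the same number of old-version nodes out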
        for i in range(self.initial_num_servers / self.swap_num_servers):
            servers_in = self.servers[
                (self.initial_num_servers + i * self.swap_num_servers) : (
                    self.initial_num_servers + (i + 1) * self.swap_num_servers
                )
            ]
            servers_out = self.servers[(i * self.swap_num_servers) : ((i + 1) * self.swap_num_servers)]
            servers = old_servers + new_servers
            self.cluster.rebalance(servers, servers_in, servers_out)
            self.log.info(
                "Swap rebalance: rebalance out %s old version nodes, rebalance in %s 2.0 Nodes"
                % (self.swap_num_servers, self.swap_num_servers)
            )
            time.sleep(self.sleep_time)
            old_servers = self.servers[((i + 1) * self.swap_num_servers) : self.initial_num_servers]
            new_servers = new_servers + servers_in
            servers = old_servers + new_servers
            status, content = ClusterHelper.find_orchestrator(servers[0])
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
            FIND_MASTER = False
            for new_server in new_servers:
                if content.find(new_server.ip) >= 0:
                    FIND_MASTER = True
                    self.log.info("2.0 Node %s becomes the master" % (new_server.ip))
            if not FIND_MASTER:
                raise Exception("After rebalance in 2.0 nodes, 2.0 doesn't become the master ")

        self.verify_upgrade_rebalance_in_out()
Ejemplo n.º 55
class NewUpgradeBaseTest(BaseTestCase):

    def setUp(self):
        super(NewUpgradeBaseTest, self).setUp()
        self.product = self.input.param('product', 'couchbase-server')
        self.initial_version = self.input.param('initial_version', '1.8.1-942-rel')
        self.initial_vbuckets = self.input.param('initial_vbuckets', 64)
        self.rest_settings = self.input.membase_settings
        self.rest = RestConnection(self.master)
        self.rest_helper = RestHelper(self.rest)
        self.sleep_time = 10
        self.data_size = self.input.param('data_size', 1024)
        self.op_types = self.input.param('op_types', 'bucket')
        self.item_flag = self.input.param('item_flag', 4042322160)
        self.expire_time = self.input.param('expire_time', 0)

    def tearDown(self):
        super(NewUpgradeBaseTest, self).tearDown()

    def _install(self, servers):
        params = {}
        params['num_nodes'] = len(servers)
        params['product'] = self.product
        params['version'] = self.initial_version
        params['vbuckets'] = [self.initial_vbuckets]
        InstallerJob().parallel_install(servers, params)
        if self.product in ["couchbase", "couchbase-server", "cb"]:
            success = True
            for server in servers:
                success &= RemoteMachineShellConnection(server).is_couchbase_installed()
                if not success:
                    self.log.info("some nodes were not install successfully!")
                    sys.exit(1)

    def operations(self, multi_nodes=False):
        self.quota = self._initialize_nodes(self.cluster, self.servers, self.disabled_consistent_view)
        self.buckets = []
        gc.collect()
        if self.total_buckets > 0:
            self.bucket_size = self._get_bucket_size(self.quota, self.total_buckets)

        if self.default_bucket:
            self.cluster.create_default_bucket(self.master, self.bucket_size, self.num_replicas)
            self.buckets.append(Bucket(name="default", authType="sasl", saslPassword="",
                                       num_replicas=self.num_replicas, bucket_size=self.bucket_size))

        self._create_sasl_buckets(self.master, self.sasl_buckets)
        self._create_standard_buckets(self.master, self.standard_buckets)
        if multi_nodes:
            servers_in = [self.servers[i+1] for i in range(self.initial_num_servers-1)]
            self.cluster.rebalance(self.servers[:1], servers_in, [])
        if self.op_types == "data":
            self._load_data_all_buckets("create")
            if multi_nodes:
                self._wait_for_stats_all_buckets(self.servers[:self.initial_num_servers])
            else:
                self._wait_for_stats_all_buckets([self.master])

    def _load_data_all_buckets(self, op_type='create', start=0):
        loaded = False
        count = 0
        gen_load = BlobGenerator('upgrade-', 'upgrade-', self.data_size, start=start, end=self.num_items)
        while not loaded and count < 60:
         try:
             self._load_all_buckets(self.master, gen_load, op_type, self.expire_time, 1,
                                    self.item_flag, True, batch_size=20000, pause_secs=5, timeout_secs=180)
             loaded = True
         except MemcachedError as error:
             if error.status == 134:
                 loaded = False
                 self.log.error("Memcached error 134, wait for %s seconds and then try again"
                                % self.sleep_time)
                 count += 1
                 time.sleep(self.sleep_time)

    def _get_build(self, server, version, remote, is_amazon=False):
        info = remote.extract_remote_info()
        builds, changes = BuildQuery().get_all_builds()
        self.log.info("finding build %s for machine %s" % (version, server))
        result = re.search('r', version)

        if result is None:
            appropriate_build = BuildQuery().\
                find_membase_release_build('%s-enterprise' % (self.product), info.deliverable_type,
                                           info.architecture_type, version.strip(), is_amazon=is_amazon)
        else:
            appropriate_build = BuildQuery().\
                find_membase_build(builds, '%s-enterprise' % (self.product), info.deliverable_type,
                                   info.architecture_type, version.strip(), is_amazon=is_amazon)

        return appropriate_build

    def _upgrade(self, upgrade_version, server, remote):
        appropriate_build = self._get_build(server, upgrade_version, remote)
        self.assertTrue(appropriate_build.url, msg="unable to find build {0}".format(upgrade_version))
        remote.download_build(appropriate_build)
        remote.membase_upgrade(appropriate_build, save_upgrade_config=False)
        self.rest_helper.is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
        self.rest.init_cluster_port(self.rest_settings.rest_username, self.rest_settings.rest_password)
        time.sleep(self.sleep_time)

    def verification(self, multi_nodes=False):
        for bucket in self.buckets:
            if self.rest_helper.bucket_exists(bucket.name):
                continue
            else:
                raise Exception("bucket:- %s not found" % bucket.name)
        if self.op_types == "bucket":
            bucketinfo = self.rest.get_bucket(bucket.name)
            self.log.info("bucket info :- %s" % bucketinfo)

        if self.op_types == "data":
            if multi_nodes:
                self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
                self._verify_all_buckets(self.master, 1, self.wait_timeout*50, self.max_verify, True, 1)
                self._verify_stats_all_buckets(self.servers[:self.num_servers])
            else:
                self._wait_for_stats_all_buckets([self.master])
                self._verify_all_buckets(self.master, 1, self.wait_timeout*50, self.max_verify, True, 1)
                self._verify_stats_all_buckets([self.master])
Ejemplo n.º 56
 def backup_restore(self):
     try:
         backup_start = self.backups[int(self.backupset.start) - 1]
     except IndexError:
         backup_start = "{0}{1}".format(self.backups[-1], self.backupset.start)
     try:
         backup_end = self.backups[int(self.backupset.end) - 1]
     except IndexError:
         backup_end = "{0}{1}".format(self.backups[-1], self.backupset.end)
     args = (
         "restore --archive {0} --repo {1} {2} http://{3}:{4} --username {5} "
         "--password {6} --start {7} --end {8}".format(
             self.backupset.directory,
             self.backupset.name,
             self.cluster_flag,
             self.backupset.restore_cluster_host.ip,
             self.backupset.restore_cluster_host.port,
             self.backupset.restore_cluster_host_username,
             self.backupset.restore_cluster_host_password,
             backup_start,
             backup_end,
         )
     )
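     # illustrative only: with placeholder values and self.cluster_flag as the
     # cluster option, the assembled command resembles:
     #   restore --archive /data/backups --repo my_repo --cluster http://10.1.1.1:8091
     #       --username Administrator --password password --start backup1 --end backup2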
     if self.backupset.exclude_buckets:
         args += " --exclude-buckets {0}".format(self.backupset.exclude_buckets)
     if self.backupset.include_buckets:
         args += " --include-buckets {0}".format(self.backupset.include_buckets)
     if self.backupset.disable_bucket_config:
         args += " --disable-bucket-config {0}".format(self.backupset.disable_bucket_config)
     if self.backupset.disable_views:
         args += " --disable-views {0}".format(self.backupset.disable_views)
     if self.backupset.disable_gsi_indexes:
         args += " --disable-gsi-indexes {0}".format(self.backupset.disable_gsi_indexes)
     if self.backupset.disable_ft_indexes:
         args += " --disable-ft-indexes {0}".format(self.backupset.disable_ft_indexes)
     if self.backupset.disable_data:
         args += " --disable-data {0}".format(self.backupset.disable_data)
     if self.backupset.disable_conf_res_restriction is not None:
         args += " --disable-conf-res-restriction {0}".format(self.backupset.disable_conf_res_restriction)
     if self.backupset.filter_keys:
         args += " --filter_keys {0}".format(self.backupset.filter_keys)
     if self.backupset.filter_values:
         args += " --filter_values {0}".format(self.backupset.filter_values)
     if self.backupset.force_updates:
         args += " --force-updates"
     if self.no_progress_bar:
         args += " --no-progress-bar"
     if not self.skip_buckets:
         rest_conn = RestConnection(self.backupset.restore_cluster_host)
         rest_helper = RestHelper(rest_conn)
         for bucket in self.buckets:
             if not rest_helper.bucket_exists(bucket.name):
                 self.log.info(
                     "Creating bucket {0} in restore host {1}".format(
                         bucket.name, self.backupset.restore_cluster_host.ip
                     )
                 )
                 rest_conn.create_bucket(
                     bucket=bucket.name,
                     ramQuotaMB=512,
                     authType=bucket.authType if bucket.authType else "none",
                     proxyPort=bucket.port,
                     saslPassword=bucket.saslPassword,
                     lww=self.lww_new,
                 )
                 bucket_ready = rest_helper.vbucket_map_ready(bucket.name)
                 if not bucket_ready:
                     self.fail("Bucket %s not created after 120 seconds." % bucket.name)
     remote_client = RemoteMachineShellConnection(self.backupset.backup_host)
     command = "{0}/cbbackupmgr {1}".format(self.cli_command_location, args)
     output, error = remote_client.execute_command(command)
     remote_client.log_command_output(output, error)
     res = output
     res.extend(error)
     error_str = "Error restoring cluster: Transfer failed. Check the logs for more information."
     if error_str in res:
         command = "cat " + self.backupset.directory + "/logs/backup.log | grep '" + error_str + "' -A 10 -B 100"
         output, error = remote_client.execute_command(command)
         remote_client.log_command_output(output, error)
     if "Required Flags:" in res:
         self.fail("Command line failed. Please check test params.")
     return output, error
Ejemplo n.º 57
    def test_backup_upgrade_restore_default(self):
        if len(self.servers) < 2:
            self.log.error("At least 2 servers required for this test ..")
            return
        original_set = copy.copy(self.servers)
        worker = self.servers[len(self.servers) - 1]
        self.servers = self.servers[:len(self.servers)-1]
        shell = RemoteMachineShellConnection(self.master)
        o, r = shell.execute_command("cat /opt/couchbase/VERSION.txt")
        fin = o[0]
        shell.disconnect()
        initial_version = self.input.param("initial_version", fin)
        final_version = self.input.param("final_version", fin)
        if initial_version == final_version:
            self.log.error("Same initial and final versions ..")
            return
        if not final_version.startswith('2.0'):
            self.log.error("Upgrade test not set to run from 1.8.1 -> 2.0 ..")
            return
        builds, changes = BuildQuery().get_all_builds(version=final_version)
        product = 'couchbase-server-enterprise'
        #CASE where the worker isn't a 2.0+
        worker_flag = 0
        shell = RemoteMachineShellConnection(worker)
        o, r = shell.execute_command("cat /opt/couchbase/VERSION.txt")
        temp = o[0]
        if not temp.startswith('2.0'):
            worker_flag = 1
        if worker_flag == 1:
            self.log.info("Loading version {0} on worker.. ".format(final_version))
            remote = RemoteMachineShellConnection(worker)
            info = remote.extract_remote_info()
            older_build = BuildQuery().find_build(builds, product, info.deliverable_type,
                                                  info.architecture_type, final_version)
            remote.stop_couchbase()
            remote.couchbase_uninstall()
            remote.download_build(older_build)
            remote.install_server(older_build)
            remote.disconnect()

        remote_tmp = "{1}/{0}".format("backup", "/root")
        perm_comm = "mkdir -p {0}".format(remote_tmp)
        if not initial_version == fin:
            for server in self.servers:
                remote = RemoteMachineShellConnection(server)
                info = remote.extract_remote_info()
                self.log.info("Loading version ..  {0}".format(initial_version))
                older_build = BuildQuery().find_build(builds, product, info.deliverable_type,
                                                      info.architecture_type, initial_version)
                remote.stop_couchbase()
                remote.couchbase_uninstall()
                remote.download_build(older_build)
                remote.install_server(older_build)
                rest = RestConnection(server)
                RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
                rest.init_cluster(server.rest_username, server.rest_password)
                rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
                remote.disconnect()

        self.common_setUp()
        bucket = "default"
        if len(self.servers) > 1:
            self.add_nodes_and_rebalance()
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        size = int(info.memoryQuota * 2.0 / 3.0)
        rest.create_bucket(bucket, ramQuotaMB=size)
        ready = BucketOperationHelper.wait_for_memcached(self.master, bucket)
        self.assertTrue(ready, "wait_for_memcached_failed")
        distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05}
        inserted_keys, rejected_keys = MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.master],
                                                                                             name=bucket,
                                                                                             ram_load_ratio=0.5,
                                                                                             value_size_distribution=distribution,
                                                                                             moxi=True,
                                                                                             write_only=True,
                                                                                             delete_ratio=0.1,
                                                                                             number_of_threads=2)
        if len(self.servers) > 1:
            rest = RestConnection(self.master)
            self.assertTrue(RebalanceHelper.wait_for_replication(rest.get_nodes(), timeout=180),
                            msg="replication did not complete")

        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        node = RestConnection(self.master).get_nodes_self()
        shell = RemoteMachineShellConnection(worker)
        o, r = shell.execute_command(perm_comm)
        shell.log_command_output(o, r)
        shell.disconnect()

        #Backup
        #BackupHelper(self.master, self).backup(bucket, node, remote_tmp)
        shell = RemoteMachineShellConnection(worker)
        shell.execute_command("/opt/couchbase/bin/cbbackup http://{0}:{1} {2}".format(
                                                            self.master.ip, self.master.port, remote_tmp))
        shell.disconnect()
        BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self)
        time.sleep(30)

        #Upgrade
        for server in self.servers:
            self.log.info("Upgrading to current version {0}".format(final_version))
            remote = RemoteMachineShellConnection(server)
            info = remote.extract_remote_info()
            new_build = BuildQuery().find_build(builds, product, info.deliverable_type,
                                                info.architecture_type, final_version)
            remote.stop_couchbase()
            remote.couchbase_uninstall()
            remote.download_build(new_build)
            remote.install_server(new_build)
            rest = RestConnection(server)
            RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
            rest.init_cluster(server.rest_username, server.rest_password)
            rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
            remote.disconnect()
        time.sleep(30)

        #Restore
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        size = int(info.memoryQuota * 2.0 / 3.0)
        rest.create_bucket(bucket, ramQuotaMB=size)
        ready = BucketOperationHelper.wait_for_memcached(self.master, bucket)
        self.assertTrue(ready, "wait_for_memcached_failed")
        #BackupHelper(self.master, self).restore(backup_location=remote_tmp, moxi_port=info.moxi)
        shell = RemoteMachineShellConnection(worker)
        shell.execute_command("/opt/couchbase/bin/cbrestore {2} http://{0}:{1} -b {3}".format(
                                                            self.master.ip, self.master.port, remote_tmp, bucket))
        shell.disconnect()
        time.sleep(60)
        keys_exist = BucketOperationHelper.keys_exist_or_assert_in_parallel(inserted_keys, self.master, bucket, self, concurrency=4)
        self.assertTrue(keys_exist, msg="unable to verify keys after restore")
        time.sleep(30)
        BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self)
        rest = RestConnection(self.master)
        helper = RestHelper(rest)
        nodes = rest.node_statuses()
        master_id = rest.get_nodes_self().id
        if len(self.servers) > 1:
            removed = helper.remove_nodes(knownNodes=[node.id for node in nodes],
                                          ejectedNodes=[node.id for node in nodes if node.id != master_id],
                                          wait_for_rebalance=True)

        shell = RemoteMachineShellConnection(worker)
        shell.remove_directory(remote_tmp)
        shell.disconnect()

        self.servers = copy.copy(original_set)
        if initial_version == fin:
            builds, changes = BuildQuery().get_all_builds(version=initial_version)
            for server in self.servers:
                remote = RemoteMachineShellConnection(server)
                info = remote.extract_remote_info()
                self.log.info("Loading version ..  {0}".format(initial_version))
                older_build = BuildQuery().find_build(builds, product, info.deliverable_type,
                                                      info.architecture_type, initial_version)
                remote.stop_couchbase()
                remote.couchbase_uninstall()
                remote.download_build(older_build)
                remote.install_server(older_build)
                rest = RestConnection(server)
                RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
                rest.init_cluster(server.rest_username, server.rest_password)
                rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
                remote.disconnect()
Ejemplo n.º 58
class PerfBase(unittest.TestCase):

    """
    specURL = http://hub.internal.couchbase.org/confluence/display/cbit/Black+Box+Performance+Test+Matrix

    """

    # The setUpBaseX() methods allow subclasses to resequence the setUp() and
    # skip cluster configuration.
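    # A hypothetical subclass (not part of this file) could resequence like so:
    #
    #   class CustomPerfTest(PerfBase):
    #       def setUp(self):
    #           self.setUpBase0()   # logging, input params, REST connection only
    #           ...                 # custom cluster/bucket configuration here
    #           self.setUpBase1()   # discover the vbucket count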
    def setUpBase0(self):
        self.log = logger.Logger.get_logger()
        self.input = TestInputSingleton.input
        self.vbucket_count = PerfDefaults.vbuckets
        self.sc = None
        if self.parami("tear_down_on_setup",
                       PerfDefaults.tear_down_on_setup) == 1:
            self.tearDown()  # Tear down in case previous run had unclean death
        master = self.input.servers[0]
        self.set_up_rest(master)

    def setUpBase1(self):
        if self.parami('num_buckets', 1) > 1:
            bucket = 'bucket-0'
        else:
            bucket = self.param('bucket', 'default')
        vBuckets = self.rest.get_vbuckets(bucket)
        self.vbucket_count = len(vBuckets)

    def setUp(self):
        self.setUpBase0()

        master = self.input.servers[0]

        self.is_multi_node = False
        self.data_path = master.data_path

        # Number of items loaded by load() method.
        # Does not include or count any items that came from set_up_dgm().
        #
        self.num_items_loaded = 0

        if self.input.clusters:
            for cluster in self.input.clusters.values():
                master = cluster[0]
                self.set_up_rest(master)
                self.set_up_cluster(master)
        else:
            master = self.input.servers[0]
            self.set_up_cluster(master)

        # Rebalance
        num_nodes = self.parami("num_nodes", 10)
        self.rebalance_nodes(num_nodes)

        if self.input.clusters:
            for cluster in self.input.clusters.values():
                master = cluster[0]
                self.set_up_rest(master)
                self.set_up_buckets()
        else:
            self.set_up_buckets()

        self.set_up_proxy()

        if self.input.clusters:
            for cluster in self.input.clusters.values():
                master = cluster[0]
                self.set_up_rest(master)
                self.reconfigure()
        else:
            self.reconfigure()

        if self.parami("dgm", getattr(self, "dgm", 1)):
            self.set_up_dgm()

        time.sleep(10)
        self.setUpBase1()

        if self.input.clusters:
            for cluster in self.input.clusters.values():
                self.wait_until_warmed_up(cluster[0])
        else:
            self.wait_until_warmed_up()
        ClusterOperationHelper.flush_os_caches(self.input.servers)

    def set_up_rest(self, master):
        self.rest = RestConnection(master)
        self.rest_helper = RestHelper(self.rest)

    def set_up_cluster(self, master):
        """Initialize cluster"""

        print "[perf.setUp] Setting up cluster"

        self.rest.init_cluster(master.rest_username, master.rest_password)

        memory_quota = self.parami('mem_quota', PerfDefaults.mem_quota)
        self.rest.init_cluster_memoryQuota(master.rest_username,
                                           master.rest_password,
                                           memoryQuota=memory_quota)

    def set_up_buckets(self):
        """Set up data bucket(s)"""

        print "[perf.setUp] Setting up buckets"

        num_buckets = self.parami('num_buckets', 1)
        if num_buckets > 1:
            self.buckets = ['bucket-{0}'.format(i) for i in range(num_buckets)]
        else:
            self.buckets = [self.param('bucket', 'default')]

        for bucket in self.buckets:
            bucket_ram_quota = self.parami('mem_quota', PerfDefaults.mem_quota)
            bucket_ram_quota = bucket_ram_quota / num_buckets
            replicas = self.parami('replicas', getattr(self, 'replicas', 1))

            self.rest.create_bucket(bucket=bucket, ramQuotaMB=bucket_ram_quota,
                                    replicaNumber=replicas, authType='sasl')

            status = self.rest_helper.vbucket_map_ready(bucket, 60)
            self.assertTrue(status, msg='vbucket_map not ready .. timed out')
            status = self.rest_helper.bucket_exists(bucket)
            self.assertTrue(status,
                            msg='unable to create {0} bucket'.format(bucket))

    def reconfigure(self):
        """Customize basic Couchbase setup"""

        print "[perf.setUp] Customizing setup"

        self.set_loglevel()
        self.set_max_concurrent_reps_per_doc()
        self.set_autocompaction()

    def set_loglevel(self):
        """Set custom loglevel"""

        loglevel = self.param('loglevel', None)
        if loglevel:
            self.rest.set_global_loglevel(loglevel)

    def set_max_concurrent_reps_per_doc(self):
        """Set custom MAX_CONCURRENT_REPS_PER_DOC"""

        max_concurrent_reps_per_doc = self.param('max_concurrent_reps_per_doc',
                                                 None)
        if max_concurrent_reps_per_doc:
            for server in self.input.servers:
                rc = RemoteMachineShellConnection(server)
                rc.set_environment_variable('MAX_CONCURRENT_REPS_PER_DOC',
                                            max_concurrent_reps_per_doc)

    def set_ep_compaction(self, comp_ratio):
        """Set up ep_engine side compaction ratio"""
        for server in self.input.servers:
            shell = RemoteMachineShellConnection(server)
            cmd = "/opt/couchbase/bin/cbepctl localhost:11210 "\
                  "set flush_param db_frag_threshold {0}".format(comp_ratio)
            self._exec_and_log(shell, cmd)
            shell.disconnect()

    def set_autocompaction(self, disable_view_compaction=False):
        """Set custom auto-compaction settings"""

        try:
            # Parallel database and view compaction
            parallel_compaction = self.param("parallel_compaction",
                                             PerfDefaults.parallel_compaction)
            # Database fragmentation threshold
            db_compaction = self.parami("db_compaction",
                                        PerfDefaults.db_compaction)
            print "[perf.setUp] database compaction = %d" % db_compaction

            # ep_engine fragmentation threshold
            ep_compaction = self.parami("ep_compaction",
                                        PerfDefaults.ep_compaction)
            self.set_ep_compaction(ep_compaction)
            print "[perf.setUp] ep_engine compaction = %d" % ep_compaction

            # View fragmentation threshold
            if disable_view_compaction:
                view_compaction = 100
            else:
                view_compaction = self.parami("view_compaction",
                                              PerfDefaults.view_compaction)
            # Set custom auto-compaction settings
            self.rest.set_auto_compaction(parallelDBAndVC=parallel_compaction,
                                          dbFragmentThresholdPercentage=db_compaction,
                                          viewFragmntThresholdPercentage=view_compaction)
        except Exception as e:
            # It's very hard to determine what exception it can raise.
            # Therefore we have to use general handler.
            print "ERROR while changing compaction settings: {0}".format(e)

    def tearDown(self):
        if self.parami("tear_down", 0) == 1:
            print "[perf.tearDown] tearDown routine skipped"
            return

        print "[perf.tearDown] tearDown routine starts"

        if self.parami("tear_down_proxy", 1) == 1:
            self.tear_down_proxy()
        else:
            print "[perf.tearDown] Proxy tearDown skipped"

        if self.sc is not None:
            self.sc.stop()
            self.sc = None

        if self.parami("tear_down_bucket", 0) == 1:
            self.tear_down_buckets()
        else:
            print "[perf.tearDown] Bucket tearDown skipped"

        if self.parami("tear_down_cluster", 1) == 1:
            self.tear_down_cluster()
        else:
            print "[perf.tearDown] Cluster tearDown skipped"

        print "[perf.tearDown] tearDown routine finished"

    def tear_down_buckets(self):
        print "[perf.tearDown] Tearing down bucket"
        BucketOperationHelper.delete_all_buckets_or_assert(self.input.servers,
                                                           self)
        print "[perf.tearDown] Bucket teared down"

    def tear_down_cluster(self):
        print "[perf.tearDown] Tearing down cluster"
        ClusterOperationHelper.cleanup_cluster(self.input.servers)
        ClusterOperationHelper.wait_for_ns_servers_or_assert(self.input.servers,
                                                             self)
        print "[perf.tearDown] Cluster teared down"

    def set_up_proxy(self, bucket=None):
        """Set up and start Moxi"""

        if self.input.moxis:
            print '[perf.setUp] Setting up proxy'

            bucket = bucket or self.param('bucket', 'default')

            shell = RemoteMachineShellConnection(self.input.moxis[0])
            shell.start_moxi(self.input.servers[0].ip, bucket,
                             self.input.moxis[0].port)
            shell.disconnect()

    def tear_down_proxy(self):
        if len(self.input.moxis) > 0:
            shell = RemoteMachineShellConnection(self.input.moxis[0])
            shell.stop_moxi()
            shell.disconnect()

    # Returns "host:port" of moxi to hit.
    def target_host_port(self, bucket='default', use_direct=False):
        rv = self.param('moxi', None)
        if use_direct:
            return "%s:%s" % (self.input.servers[0].ip,
                              '11210')
        if rv:
            return rv
        if len(self.input.moxis) > 0:
            return "%s:%s" % (self.input.moxis[0].ip,
                              self.input.moxis[0].port)
        return "%s:%s" % (self.input.servers[0].ip,
                          self.rest.get_bucket(bucket).nodes[0].moxi)
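
    # Resolution order above, spelled out: (1) use_direct=True always returns
    # <first server ip>:11210 (direct memcached); (2) otherwise an explicit
    # 'moxi' test param wins; (3) otherwise the first standalone moxi from
    # the ini file; (4) otherwise the bucket's embedded moxi port via REST.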

    def protocol_parse(self, protocol_in, use_direct=False):
        if protocol_in.find('://') >= 0:
            if protocol_in.find("couchbase:") >= 0:
                protocol = "couchbase"
            else:
                protocol = \
                    '-'.join(((["membase"] +
                    protocol_in.split("://"))[-2] + "-binary").split('-')[0:2])
            host_port = ('@' + protocol_in.split("://")[-1]).split('@')[-1]
            user, pswd = (('@' +
                           protocol_in.split("://")[-1]).split('@')[-2] +
                           ":").split(':')[0:2]
        else:
            protocol = 'memcached-' + protocol_in
            host_port = self.target_host_port(use_direct=use_direct)
            user = self.param("rest_username", "Administrator")
            pswd = self.param("rest_password", "password")
        return protocol, host_port, user, pswd
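
    # Worked examples (hypothetical host and credentials), traced through
    # the string munging above:
    #
    #   protocol_parse("membase-binary://Administrator:password@10.1.2.3:11211")
    #     -> ("membase-binary", "10.1.2.3:11211", "Administrator", "password")
    #   protocol_parse("membase-binary://10.1.2.3:11211")
    #     -> ("membase-binary", "10.1.2.3:11211", "", "")  # blank credentials;
    #        callers fall back to the servers' REST username/password
    #   protocol_parse("binary")
    #     -> ("memcached-binary", target_host_port(), rest user, rest password)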

    def mk_protocol(self, host, port='8091', prefix='membase-binary'):
        return self.param('protocol',
                          prefix + '://' + host + ':' + port)

    def restartProxy(self, bucket=None):
        self.tear_down_proxy()
        self.set_up_proxy(bucket)

    def set_up_dgm(self):
        """Download fragmented, DGM dataset onto each cluster node, if not
        already locally available.

        The number of vbuckets and database schema must match the
        target cluster.

        Shutdown all cluster nodes.

        Do a cluster-restore.

        Restart all cluster nodes."""

        bucket = self.param("bucket", "default")
        ClusterOperationHelper.stop_cluster(self.input.servers)
        for server in self.input.servers:
            remote = RemoteMachineShellConnection(server)
            # TODO: better way to pass num_nodes and db_size?
            self.get_data_files(remote, bucket, 1, 10)
            remote.disconnect()
        ClusterOperationHelper.start_cluster(self.input.servers)

    def get_data_files(self, remote, bucket, num_nodes, db_size):
        base = 'https://s3.amazonaws.com/database-analysis'
        dir = '/tmp'
        if remote.is_couchbase_installed():
            dir = dir + '/couchbase/{0}-{1}-{2}/'.format(num_nodes, 256,
                                                         db_size)
            output, error = remote.execute_command('mkdir -p {0}'.format(dir))
            remote.log_command_output(output, error)
            file = '{0}_cb.tar.gz'.format(bucket)
            base_url = base + '/couchbase/{0}-{1}-{2}/{3}'.format(num_nodes,
                                                                  256, db_size,
                                                                  file)
        else:
            dir = dir + '/membase/{0}-{1}-{2}/'.format(num_nodes, 1024,
                                                       db_size)
            output, error = remote.execute_command('mkdir -p {0}'.format(dir))
            remote.log_command_output(output, error)
            file = '{0}_mb.tar.gz'.format(bucket)
            base_url = base + '/membase/{0}-{1}-{2}/{3}'.format(num_nodes,
                                                                1024, db_size,
                                                                file)
        info = remote.extract_remote_info()
        wget_command = 'wget'
        if info.type.lower() == 'windows':
            wget_command = \
                "cd {0} ;cmd /c 'c:\\automation\\wget.exe --no-check-certificate"\
                .format(dir)

        # Download the gzipped data files unless they already exist on the
        # remote server, then extract them into the data directory
        exist = remote.file_exists(dir, file)
        if not exist:
            additional_quote = ""
            if info.type.lower() == 'windows':
                additional_quote = "'"
            command = "{0} -v -O {1}{2} {3} {4} ".format(wget_command, dir,
                                                         file, base_url,
                                                         additional_quote)
            output, error = remote.execute_command(command)
            remote.log_command_output(output, error)

        if remote.is_couchbase_installed():
            if info.type.lower() == 'windows':
                destination_folder = testconstants.WIN_COUCHBASE_DATA_PATH
            else:
                destination_folder = testconstants.COUCHBASE_DATA_PATH
        else:
            if info.type.lower() == 'windows':
                destination_folder = testconstants.WIN_MEMBASE_DATA_PATH
            else:
                destination_folder = testconstants.MEMBASE_DATA_PATH
        if self.data_path:
            destination_folder = self.data_path
        untar_command = 'cd {1}; tar -xzf {0}'.format(dir + file,
                                                      destination_folder)
        output, error = remote.execute_command(untar_command)
        remote.log_command_output(output, error)

    def _exec_and_log(self, shell, cmd):
        """helper method to execute a command and log output"""
        if not cmd or not shell:
            return

        output, error = shell.execute_command(cmd)
        shell.log_command_output(output, error)

    def _build_tar_name(self, bucket, version="unknown_version",
                        file_base=None):
        """build tar file name.

        {file_base}-{version}-{bucket}.tar.gz
        """
        if not file_base:
            file_base = os.path.splitext(
                os.path.basename(self.param("conf_file",
                                 PerfDefaults.conf_file)))[0]
        return "{0}-{1}-{2}.tar.gz".format(file_base, version, bucket)

    def _save_snapshot(self, server, bucket, file_base=None):
        """Save data files to a snapshot"""

        src_data_path = os.path.dirname(server.data_path or
                                        testconstants.COUCHBASE_DATA_PATH)
        dest_data_path = "{0}-snapshots".format(src_data_path)

        print "[perf: _save_snapshot] server = {0} , src_data_path = {1}, dest_data_path = {2}"\
            .format(server.ip, src_data_path, dest_data_path)

        shell = RemoteMachineShellConnection(server)

        build_name, short_version, full_version = \
            shell.find_build_version("/opt/couchbase/", "VERSION.txt", "cb")

        dest_file = self._build_tar_name(bucket, full_version, file_base)

        self._exec_and_log(shell, "mkdir -p {0}".format(dest_data_path))

        # save as a gzip file; if the file exists, overwrite it
        # TODO: multiple buckets
        zip_cmd = "cd {0}; tar -cvzf {1}/{2} {3} {3}-data _*"\
            .format(src_data_path, dest_data_path, dest_file, bucket)
        self._exec_and_log(shell, zip_cmd)

        shell.disconnect()
        return True
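
    # Sketch of the shell commands issued above (assuming the stock data path
    # /opt/couchbase/var/lib/couchbase/data and bucket "default"):
    #
    #   mkdir -p /opt/couchbase/var/lib/couchbase/data-snapshots
    #   cd /opt/couchbase/var/lib/couchbase/data; \
    #       tar -cvzf /opt/couchbase/var/lib/couchbase/data-snapshots/<tar name> \
    #           default default-data _*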

    def _load_snapshot(self, server, bucket, file_base=None, overwrite=True):
        """Load data files from a snapshot"""

        dest_data_path = os.path.dirname(server.data_path or
                                         testconstants.COUCHBASE_DATA_PATH)
        src_data_path = "{0}-snapshots".format(dest_data_path)

        print "[perf: _load_snapshot] server = {0} , src_data_path = {1}, dest_data_path = {2}"\
            .format(server.ip, src_data_path, dest_data_path)

        shell = RemoteMachineShellConnection(server)

        build_name, short_version, full_version = \
            shell.find_build_version("/opt/couchbase/", "VERSION.txt", "cb")

        src_file = self._build_tar_name(bucket, full_version, file_base)

        if not shell.file_exists(src_data_path, src_file):
            print "[perf: _load_snapshot] file '{0}/{1}' does not exist"\
                .format(src_data_path, src_file)
            shell.disconnect()
            return False

        if not overwrite:
            self._save_snapshot(server, bucket,
                                "{0}.tar.gz".format(
                                    time.strftime(PerfDefaults.strftime)))  # TODO: filename

        rm_cmd = "rm -rf {0}/{1} {0}/{1}-data {0}/_*".format(dest_data_path,
                                                             bucket)
        self._exec_and_log(shell, rm_cmd)

        unzip_cmd = "cd {0}; tar -xvzf {1}/{2}".format(dest_data_path,
                                                       src_data_path, src_file)
        self._exec_and_log(shell, unzip_cmd)

        shell.disconnect()
        return True

    def save_snapshots(self, file_base, bucket):
        """Save snapshots on all servers"""
        if not self.input.servers or not bucket:
            print "[perf: save_snapshot] invalid server list or bucket name"
            return False

        ClusterOperationHelper.stop_cluster(self.input.servers)

        for server in self.input.servers:
            self._save_snapshot(server, bucket, file_base)

        ClusterOperationHelper.start_cluster(self.input.servers)

        return True

    def load_snapshots(self, file_base, bucket):
        """Load snapshots on all servers"""
        if not self.input.servers or not bucket:
            print "[perf: load_snapshot] invalid server list or bucket name"
            return False

        ClusterOperationHelper.stop_cluster(self.input.servers)

        for server in self.input.servers:
            if not self._load_snapshot(server, bucket, file_base):
                ClusterOperationHelper.start_cluster(self.input.servers)
                return False

        ClusterOperationHelper.start_cluster(self.input.servers)

        return True

    def spec(self, reference):
        self.spec_reference = self.param("spec", reference)
        self.log.info("spec: " + reference)

    def mk_stats(self, verbosity):
        return StatsCollector(verbosity)

    def _get_src_version(self):
        """get testrunner version"""
        try:
            result = subprocess.Popen(['git', 'rev-parse', 'HEAD'],
                                      stdout=subprocess.PIPE).communicate()[0]
        except (OSError, subprocess.CalledProcessError) as e:  # Popen raises OSError if git is missing
            print "[perf] unable to get src code version : {0}".format(str(e))
            return "unknown version"
        return result.rstrip()[:7]
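
    # Example: if 'git rev-parse HEAD' prints
    # "a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0", the method returns the
    # 7-character short hash "a1b2c3d".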

    def start_stats(self, stats_spec, servers=None,
                    process_names=['memcached', 'beam.smp', 'couchjs'],
                    test_params=None, client_id='',
                    collect_server_stats=True, ddoc=None):
        if self.parami('stats', 1) == 0:
            return None

        servers = servers or self.input.servers
        sc = self.mk_stats(False)
        bucket = self.param("bucket", "default")
        sc.start(servers, bucket, process_names, stats_spec, 10, client_id,
                 collect_server_stats=collect_server_stats, ddoc=ddoc)
        if test_params is None:
            test_params = {}
        test_params['testrunner'] = self._get_src_version()
        self.test_params = test_params
        self.sc = sc
        return self.sc

    def end_stats(self, sc, total_stats=None, stats_spec=None):
        if sc is None:
            return
        if stats_spec is None:
            stats_spec = self.spec_reference
        if total_stats:
            sc.total_stats(total_stats)
        self.log.info("stopping stats collector")
        sc.stop()
        self.log.info("stats collector is stopped")
        sc.export(stats_spec, self.test_params)

    def load(self, num_items, min_value_size=None,
             kind='binary',
             protocol='binary',
             ratio_sets=1.0,
             ratio_hot_sets=0.0,
             ratio_hot_gets=0.0,
             ratio_expirations=0.0,
             expiration=None,
             prefix="",
             doc_cache=1,
             use_direct=True,
             report=0,
             start_at=-1,
             collect_server_stats=True,
             is_eperf=False,
             hot_shift=0):
        cfg = {'max-items': num_items,
               'max-creates': num_items,
               'max-ops-per-sec': self.parami("load_mcsoda_max_ops_sec",
                                              PerfDefaults.mcsoda_max_ops_sec),
               'min-value-size': min_value_size or self.parami("min_value_size",
                                                               1024),
               'ratio-sets': self.paramf("load_ratio_sets", ratio_sets),
               'ratio-misses': self.paramf("load_ratio_misses", 0.0),
               'ratio-creates': self.paramf("load_ratio_creates", 1.0),
               'ratio-deletes': self.paramf("load_ratio_deletes", 0.0),
               'ratio-hot': 0.0,
               'ratio-hot-sets': ratio_hot_sets,
               'ratio-hot-gets': ratio_hot_gets,
               'ratio-expirations': ratio_expirations,
               'expiration': expiration or 0,
               'exit-after-creates': 1,
               'json': int(kind == 'json'),
               'batch': self.parami("batch", PerfDefaults.batch),
               'vbuckets': self.vbucket_count,
               'doc-cache': doc_cache,
               'prefix': prefix,
               'report': report,
               'hot-shift': hot_shift,
               'cluster_name': self.param("cluster_name", "")}
        cur = {}
        if start_at >= 0:
            cur['cur-items'] = start_at
            cur['cur-gets'] = start_at
            cur['cur-sets'] = start_at
            cur['cur-ops'] = cur['cur-gets'] + cur['cur-sets']
            cur['cur-creates'] = start_at
            cfg['max-creates'] = start_at + num_items
            cfg['max-items'] = cfg['max-creates']
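
        # Worked example of the restart math above (illustrative numbers):
        # with start_at=1000 and num_items=500, loading resumes at item 1000
        # and creates items 1000..1499, so max-creates becomes 1500 and
        # max-items is raised to match.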

        cfg_params = cfg.copy()
        cfg_params['test_time'] = time.time()
        cfg_params['test_name'] = self.id()

        # phase: 'load' or 'reload'
        phase = "load"
        if self.parami("hot_load_phase", 0) == 1:
            phase = "reload"

        if is_eperf:
            collect_server_stats = self.parami("prefix", 0) == 0
            client_id = self.parami("prefix", 0)
            sc = self.start_stats("{0}.{1}".format(self.spec_reference, phase), # stats spec e.x: testname.load
                                  test_params=cfg_params, client_id=client_id,
                                  collect_server_stats=collect_server_stats)

        # For black-box, multi-node tests, always use membase-binary
        if self.is_multi_node:
            protocol = self.mk_protocol(host=self.input.servers[0].ip,
                                        port=self.input.servers[0].port)

        protocol, host_port, user, pswd = \
            self.protocol_parse(protocol, use_direct=use_direct)

        if not user.strip():
            user = self.input.servers[0].rest_username
        if not pswd.strip():
            pswd = self.input.servers[0].rest_password

        self.log.info("mcsoda - %s %s %s %s" %
                      (protocol, host_port, user, pswd))
        self.log.info("mcsoda - cfg: " + str(cfg))
        self.log.info("mcsoda - cur: " + str(cur))

        cur, start_time, end_time = \
            self.mcsoda_run(cfg, cur, protocol, host_port, user, pswd,
                            heartbeat=self.parami("mcsoda_heartbeat", 0),
                            why="load", bucket=self.param("bucket", "default"))
        self.num_items_loaded = num_items
        ops = {'tot-sets': cur.get('cur-sets', 0),
               'tot-gets': cur.get('cur-gets', 0),
               'tot-items': cur.get('cur-items', 0),
               'tot-creates': cur.get('cur-creates', 0),
               'tot-misses': cur.get('cur-misses', 0),
               "start-time": start_time,
               "end-time": end_time}

        if is_eperf:
            if self.parami("load_wait_until_drained", 1) == 1:
                self.wait_until_drained()
            if self.parami("load_wait_until_repl",
                PerfDefaults.load_wait_until_repl) == 1:
                self.wait_until_repl()
            self.end_stats(sc, ops, "{0}.{1}".format(self.spec_reference,
                                                     phase))

        return ops, start_time, end_time

    def mcsoda_run(self, cfg, cur, protocol, host_port, user, pswd,
                   stats_collector=None, stores=None, ctl=None,
                   heartbeat=0, why="", bucket="default"):
        return mcsoda.run(cfg, cur, protocol, host_port, user, pswd,
                          stats_collector=stats_collector,
                          stores=stores,
                          ctl=ctl,
                          heartbeat=heartbeat,
                          why=why,
                          bucket=bucket)

    def rebalance_nodes(self, num_nodes):
        """Rebalance cluster(s) if more than 1 node provided"""

        if len(self.input.servers) == 1 or num_nodes == 1:
            print "WARNING: running on single node cluster"
            return
        else:
            print "[perf.setUp] rebalancing nodes: num_nodes = {0}".\
                format(num_nodes)

        if self.input.clusters:
            for cluster in self.input.clusters.values():
                status, _ = RebalanceHelper.rebalance_in(cluster,
                                                         num_nodes - 1,
                                                         do_shuffle=False)
                self.assertTrue(status)
        else:
            status, _ = RebalanceHelper.rebalance_in(self.input.servers,
                                                     num_nodes - 1,
                                                     do_shuffle=False)
            self.assertTrue(status)

    @staticmethod
    def delayed_rebalance_worker(servers, num_nodes, delay_seconds, sc,
                                 max_retries=PerfDefaults.reb_max_retries):
        time.sleep(delay_seconds)
        gmt_now = time.strftime(PerfDefaults.strftime, time.gmtime())
        print "[delayed_rebalance_worker] rebalance started: %s" % gmt_now

        if not sc:
            print "[delayed_rebalance_worker] invalid stats collector"
            return
        status = False
        retries = 0
        while not status and retries <= max_retries:
            start_time = time.time()
            status, nodes = RebalanceHelper.rebalance_in(servers,
                                                         num_nodes - 1,
                                                         do_check=(not retries))
            end_time = time.time()
            print "[delayed_rebalance_worker] status: {0}, nodes: {1}, retries: {2}"\
                .format(status, nodes, retries)
            if not status:
                retries += 1
                time.sleep(delay_seconds)
        sc.reb_stats(start_time, end_time - start_time)

    def delayed_rebalance(self, num_nodes, delay_seconds=10,
                          max_retries=PerfDefaults.reb_max_retries,
                          sync=False):
        print "delayed_rebalance"
        if sync:
            PerfBase.delayed_rebalance_worker(self.input.servers,
                    num_nodes, delay_seconds, self.sc, max_retries)
        else:
            t = threading.Thread(target=PerfBase.delayed_rebalance_worker,
                                 args=(self.input.servers, num_nodes,
                                 delay_seconds, self.sc, max_retries))
            t.daemon = True
            t.start()

    @staticmethod
    def set_auto_compaction(server, parallel_compaction, percent_threshold):
        rest = RestConnection(server)
        rest.set_auto_compaction(parallel_compaction,
                                 dbFragmentThresholdPercentage=percent_threshold,
                                 viewFragmntThresholdPercentage=percent_threshold)

    @staticmethod
    def delayed_compaction_worker(servers, parallel_compaction,
                                  percent_threshold, delay_seconds):
        time.sleep(delay_seconds)
        PerfBase.set_auto_compaction(servers[0], parallel_compaction,
                                     percent_threshold)

    def delayed_compaction(self, parallel_compaction="false",
                           percent_threshold=0.01,
                           delay_seconds=10):
        t = threading.Thread(target=PerfBase.delayed_compaction_worker,
                             args=(self.input.servers,
                                   parallel_compaction,
                                   percent_threshold,
                                   delay_seconds))
        t.daemon = True
        t.start()

    def loop(self, num_ops=None,
             num_items=None,
             max_items=None,
             max_creates=None,
             min_value_size=None,
             exit_after_creates=0,
             kind='binary',
             protocol='binary',
             clients=1,
             ratio_misses=0.0,
             ratio_sets=0.0, ratio_creates=0.0, ratio_deletes=0.0,
             ratio_hot=0.2, ratio_hot_sets=0.95, ratio_hot_gets=0.95,
             ratio_expirations=0.0,
             expiration=None,
             test_name=None,
             prefix="",
             doc_cache=1,
             use_direct=True,
             collect_server_stats=True,
             start_at=-1,
             report=0,
             ctl=None,
             hot_shift=0,
             is_eperf=False,
             ratio_queries=0,
             queries=0,
             ddoc=None):
        num_items = num_items or self.num_items_loaded

        hot_stack_size = \
            self.parami('hot_stack_size', PerfDefaults.hot_stack_size) or \
            (num_items * ratio_hot)

        cfg = {'max-items': max_items or num_items,
               'max-creates': max_creates or 0,
               'max-ops-per-sec': self.parami("mcsoda_max_ops_sec",
                                              PerfDefaults.mcsoda_max_ops_sec),
               'min-value-size': min_value_size or self.parami("min_value_size",
                                                               1024),
               'exit-after-creates': exit_after_creates,
               'ratio-sets': ratio_sets,
               'ratio-misses': ratio_misses,
               'ratio-creates': ratio_creates,
               'ratio-deletes': ratio_deletes,
               'ratio-hot': ratio_hot,
               'ratio-hot-sets': ratio_hot_sets,
               'ratio-hot-gets': ratio_hot_gets,
               'ratio-expirations': ratio_expirations,
               'ratio-queries': ratio_queries,
               'expiration': expiration or 0,
               'threads': clients,
               'json': int(kind == 'json'),
               'batch': self.parami("batch", PerfDefaults.batch),
               'vbuckets': self.vbucket_count,
               'doc-cache': doc_cache,
               'prefix': prefix,
               'queries': queries,
               'report': report,
               'hot-shift': hot_shift,
               'hot-stack': self.parami("hot_stack", PerfDefaults.hot_stack),
               'hot-stack-size': hot_stack_size,
               'hot-stack-rotate': self.parami("hot_stack_rotate",
                                               PerfDefaults.hot_stack_rotate),
               'cluster_name': self.param("cluster_name", ""),
               'observe': self.param("observe", PerfDefaults.observe),
               'obs-backoff': self.paramf('obs_backoff',
                                          PerfDefaults.obs_backoff),
               'obs-max-backoff': self.paramf('obs_max_backoff',
                                              PerfDefaults.obs_max_backoff),
               'obs-persist-count': self.parami('obs_persist_count',
                                                PerfDefaults.obs_persist_count),
               'obs-repl-count': self.parami('obs_repl_count',
                                             PerfDefaults.obs_repl_count),
               'woq-pattern': self.parami('woq_pattern',
                                         PerfDefaults.woq_pattern),
               'woq-verbose': self.parami('woq_verbose',
                                         PerfDefaults.woq_verbose),
               'cor-pattern': self.parami('cor_pattern',
                                         PerfDefaults.cor_pattern),
               'cor-persist': self.parami('cor_persist',
                                         PerfDefaults.cor_persist),
               'carbon': self.parami('carbon', PerfDefaults.carbon),
               'carbon-server': self.param('carbon_server',
                                           PerfDefaults.carbon_server),
               'carbon-port': self.parami('carbon_port',
                                          PerfDefaults.carbon_port),
               'carbon-timeout': self.parami('carbon_timeout',
                                             PerfDefaults.carbon_timeout),
               'carbon-cache-size': self.parami('carbon_cache_size',
                                                PerfDefaults.carbon_cache_size),
               'time': self.parami('time', 0)}

        cfg_params = cfg.copy()
        cfg_params['test_time'] = time.time()
        cfg_params['test_name'] = test_name
        client_id = ''
        stores = None

        if is_eperf:
            client_id = self.parami("prefix", 0)
        sc = None
        if self.parami("collect_stats", 1):
            sc = self.start_stats(self.spec_reference + ".loop",
                                  test_params=cfg_params, client_id=client_id,
                                  collect_server_stats=collect_server_stats,
                                  ddoc=ddoc)

        self.cur = {'cur-items': num_items}
        if start_at >= 0:
            self.cur['cur-gets'] = start_at
        if num_ops is None:
            num_ops = num_items
        if isinstance(num_ops, int):
            cfg['max-ops'] = num_ops
        else:
            # Here, num_ops is a "time to run" tuple of the form
            # ('seconds', integer_num_of_seconds_to_run)
            cfg['time'] = num_ops[1]

        # For black-box, multi-node tests, always use membase-binary
        if self.is_multi_node:
            protocol = self.mk_protocol(host=self.input.servers[0].ip,
                                        port=self.input.servers[0].port)

        self.log.info("mcsoda - protocol %s" % protocol)
        protocol, host_port, user, pswd = \
            self.protocol_parse(protocol, use_direct=use_direct)

        if not user.strip():
            user = self.input.servers[0].rest_username
        if not pswd.strip():
            pswd = self.input.servers[0].rest_password

        self.log.info("mcsoda - %s %s %s %s" %
                      (protocol, host_port, user, pswd))
        self.log.info("mcsoda - cfg: " + str(cfg))
        self.log.info("mcsoda - cur: " + str(self.cur))

        # For query tests always use StoreCouchbase
        if protocol == "couchbase":
            stores = [StoreCouchbase()]

        self.cur, start_time, end_time = \
            self.mcsoda_run(cfg, self.cur, protocol, host_port, user, pswd,
                            stats_collector=sc, ctl=ctl, stores=stores,
                            heartbeat=self.parami("mcsoda_heartbeat", 0),
                            why="loop", bucket=self.param("bucket", "default"))

        ops = {'tot-sets': self.cur.get('cur-sets', 0),
               'tot-gets': self.cur.get('cur-gets', 0),
               'tot-items': self.cur.get('cur-items', 0),
               'tot-creates': self.cur.get('cur-creates', 0),
               'tot-misses': self.cur.get('cur-misses', 0),
               "start-time": start_time,
               "end-time": end_time}

        # Wait until there are no active indexing tasks
        if self.parami('wait_for_indexer', 0):
            ClusterOperationHelper.wait_for_completion(self.rest, 'indexer')

        # Wait until there are no active view compaction tasks
        if self.parami('wait_for_compaction', 0):
            ClusterOperationHelper.wait_for_completion(self.rest,
                                                       'view_compaction')

        if self.parami("loop_wait_until_drained",
                       PerfDefaults.loop_wait_until_drained):
            self.wait_until_drained()

        if self.parami("loop_wait_until_repl",
                       PerfDefaults.loop_wait_until_repl):
            self.wait_until_repl()

        if self.parami("collect_stats", 1) and \
                not self.parami("reb_no_fg", PerfDefaults.reb_no_fg):
            self.end_stats(sc, ops, self.spec_reference + ".loop")

        return ops, start_time, end_time

    def wait_until_drained(self):
        print "[perf.drain] draining disk write queue : %s"\
            % time.strftime(PerfDefaults.strftime)

        master = self.input.servers[0]
        bucket = self.param("bucket", "default")

        RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                              'ep_queue_size', 0,
                                              fn=RebalanceHelper.wait_for_stats_no_timeout)
        RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                              'ep_flusher_todo', 0,
                                              fn=RebalanceHelper.wait_for_stats_no_timeout)

        print "[perf.drain] disk write queue has been drained: %s"\
            % time.strftime(PerfDefaults.strftime)

        return time.time()
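
    # Both stats are polled through memcached on every node: ep_queue_size
    # counts items queued for disk and ep_flusher_todo counts items the
    # flusher is currently writing; the write queue is considered drained
    # when both drop to 0.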

    def wait_until_repl(self):
        print "[perf.repl] waiting for replication: %s"\
            % time.strftime(PerfDefaults.strftime)

        master = self.input.servers[0]
        bucket = self.param("bucket", "default")

        RebalanceHelper.wait_for_stats_on_all(master, bucket,
            'vb_replica_queue_size', 0,
            fn=RebalanceHelper.wait_for_stats_no_timeout)

        RebalanceHelper.wait_for_stats_on_all(master, bucket,
            'ep_tap_replica_queue_itemondisk', 0,
            fn=RebalanceHelper.wait_for_stats_no_timeout)

        RebalanceHelper.wait_for_stats_on_all(master, bucket,
            'ep_tap_rebalance_queue_backfillremaining', 0,
            fn=RebalanceHelper.wait_for_stats_no_timeout)

        RebalanceHelper.wait_for_stats_on_all(master, bucket,
            'ep_tap_replica_qlen', 0,
            fn=RebalanceHelper.wait_for_stats_no_timeout)

        print "[perf.repl] replication is done: %s"\
            % time.strftime(PerfDefaults.strftime)

    def warmup(self, collect_stats=True, flush_os_cache=False):
        """
        Restart cluster and wait for it to warm up.
        In the current version, this affects the master node only.
        """
        if not self.input.servers:
            print "[warmup error] empty server list"
            return

        if collect_stats:
            client_id = self.parami("prefix", 0)
            test_params = {'test_time': time.time(),
                           'test_name': self.id(),
                           'json': 0}
            sc = self.start_stats(self.spec_reference + ".warmup",
                                  test_params=test_params,
                                  client_id=client_id)

        print "[warmup] preparing to warmup cluster ..."

        server = self.input.servers[0]
        shell = RemoteMachineShellConnection(server)

        start_time = time.time()

        print "[warmup] stopping couchbase ... ({0}, {1})"\
            .format(server.ip, time.strftime(PerfDefaults.strftime))
        shell.stop_couchbase()
        print "[warmup] couchbase stopped ({0}, {1})"\
            .format(server.ip, time.strftime(PerfDefaults.strftime))

        if flush_os_cache:
            print "[warmup] flushing os cache ..."
            shell.flush_os_caches()

        shell.start_couchbase()
        print "[warmup] couchbase restarted ({0}, {1})"\
            .format(server.ip, time.strftime(PerfDefaults.strftime))

        self.wait_until_warmed_up()
        print "[warmup] warmup finished"

        end_time = time.time()
        ops = {'tot-sets': 0,
               'tot-gets': 0,
               'tot-items': 0,
               'tot-creates': 0,
               'tot-misses': 0,
               "start-time": start_time,
               "end-time": end_time}

        if collect_stats:
            self.end_stats(sc, ops, self.spec_reference + ".warmup")

    def wait_until_warmed_up(self, master=None):
        if not master:
            master = self.input.servers[0]

        fn = RebalanceHelper.wait_for_mc_stats_no_timeout
        for bucket in self.buckets:
            RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                                  'ep_warmup_thread',
                                                  'complete', fn=fn)

    def param(self, name, default_value):
        input = getattr(self, "input", TestInputSingleton.input)
        return input.test_params.get(name, default_value)

    def parami(self, name, default_int):
        return int(self.param(name, default_int))

    def paramf(self, name, default_float):
        return float(self.param(name, default_float))

    def params(self, name, default_str):
        return str(self.param(name, default_str))
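
    # Worked example (hypothetical ini): with test_params containing
    # {"num_items": "100000", "ratio_hot": "0.2"}:
    #
    #   self.parami("num_items", 1000)  -> 100000   (int coercion)
    #   self.paramf("ratio_hot", 0.05)  -> 0.2      (float coercion)
    #   self.parami("batch", 50)        -> 50       (default when absent)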
Example No. 59
0
    def _install_and_upgrade(self, initial_version='1.6.5.3',
                             create_buckets=False,
                             insert_data=False,
                             start_upgraded_first=True,
                             load_ratio=-1,
                             roll_upgrade=False,
                             upgrade_path=None):
        node_upgrade_path = []
        node_upgrade_path.extend(upgrade_path or [])
        # the nodes can then be started in whatever order the test requires
        inserted_keys = []
        log = logger.Logger.get_logger()
        if roll_upgrade:
            log.info("performing a rolling upgrade")
        input = TestInputSingleton.input
        rest_settings = input.membase_settings
        servers = input.servers
        save_upgrade_config = False
        is_amazon = bool(input.test_params.get('amazon', False))
        # install older build on all nodes
        for server in servers:
            remote = RemoteMachineShellConnection(server)
            rest = RestConnection(server)
            info = remote.extract_remote_info()
            older_build = BuildQuery().find_membase_release_build(
                deliverable_type=info.deliverable_type,
                os_architecture=info.architecture_type,
                build_version=initial_version,
                product='membase-server-enterprise',
                is_amazon=is_amazon)

            remote.membase_uninstall()
            remote.couchbase_uninstall()
            remote.execute_command('/etc/init.d/membase-server stop')
            remote.download_build(older_build)
            # now install the older build
            remote.membase_install(older_build)
            RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
            rest.init_cluster_port(rest_settings.rest_username, rest_settings.rest_password)
            rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
            remote.disconnect()

        bucket_data = {}
        master = servers[0]
        if create_buckets:
            # create the buckets and wait for them to come up; the bucket
            # port should also be configurable and passed as a test
            # parameter later

            self._create_default_bucket(master)
            inserted_keys = self._load_data(master, load_ratio)
            _create_load_multiple_bucket(self, master, bucket_data, howmany=2)

        # cluster all the nodes together
        ClusterOperationHelper.add_all_nodes_or_assert(master,
                                                       servers,
                                                       rest_settings, self)
        rest = RestConnection(master)
        nodes = rest.node_statuses()
        otpNodeIds = [node.id for node in nodes]
        rebalanceStarted = rest.rebalance(otpNodeIds, [])
        self.assertTrue(rebalanceStarted,
                        "unable to start rebalance on master node {0}".format(master.ip))
        log.info('started rebalance operation on master node {0}'.format(master.ip))
        rebalanceSucceeded = rest.monitorRebalance()
        self.assertTrue(rebalanceSucceeded,
                        "rebalance operation for nodes: {0} was not successful".format(otpNodeIds))

        if initial_version == "1.7.0" or initial_version == "1.7.1":
            self._save_config(rest_settings, master)

        input_version = input.test_params['version']
        node_upgrade_path.append(input_version)
        # non-rolling upgrade: upgrade all nodes in place
        log.info("Upgrade path: {0} -> {1}".format(initial_version, node_upgrade_path))
        log.info("List of servers {0}".format(servers))
        if not roll_upgrade:
            for version in node_upgrade_path:
                if version != initial_version:
                    log.info("Upgrading to version {0}".format(version))
                    self._stop_membase_servers(servers)
                    if re.search(r'1\.8', version):
                        save_upgrade_config = True

                    appropriate_build = _get_build(servers[0], version, is_amazon=is_amazon)
                    self.assertTrue(appropriate_build.url, msg="unable to find build {0}".format(version))
                    for server in servers:
                        remote = RemoteMachineShellConnection(server)
                        remote.download_build(appropriate_build)
                        remote.membase_upgrade(appropriate_build, save_upgrade_config=save_upgrade_config)
                        RestHelper(RestConnection(server)).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)

                        # verify admin credentials survived the upgrade: the
                        # authenticated REST call must succeed and report a version
                        pools_info = RestConnection(server).get_pools_info()
                        self.assertTrue(pools_info['implementationVersion'],
                                        msg="no implementationVersion reported; expected {0}".format(
                                            appropriate_build.product_version))

                        if start_upgraded_first:
                            log.info("Starting server {0} post upgrade".format(server))
                            remote.start_membase()
                        else:
                            remote.stop_membase()

                        remote.disconnect()
                    if not start_upgraded_first:
                        log.info("Starting all servers together")
                        self._start_membase_servers(servers)
                    time.sleep(TIMEOUT_SECS)
                    if version == "1.7.0" or version == "1.7.1":
                        self._save_config(rest_settings, master)

                    if create_buckets:
                        self.assertTrue(BucketOperationHelper.wait_for_bucket_creation('default', RestConnection(master)),
                                        msg="bucket 'default' does not exist..")
                    if insert_data:
                        self._verify_data(master, rest, inserted_keys)

        # rolling upgrade
        else:
            version = input.test_params['version']
            appropriate_build = _get_build(servers[0], version, is_amazon=is_amazon)
            self.assertTrue(appropriate_build.url, msg="unable to find build {0}".format(version))
            # rebalance node out
            # remove membase from node
            # install destination version onto node
            # rebalance it back into the cluster
            for server_index in range(len(servers)):
                server = servers[server_index]
                master = servers[server_index - 1]
                log.info("current master is {0}, rolling node is {1}".format(master, server))

                rest = RestConnection(master)
                nodes = rest.node_statuses()
                allNodes = []
                toBeEjectedNodes = []
                for node in nodes:
                    allNodes.append(node.id)
                    if "{0}:{1}".format(node.ip, node.port) == "{0}:{1}".format(server.ip, server.port):
                        toBeEjectedNodes.append(node.id)
                helper = RestHelper(rest)
                removed = helper.remove_nodes(knownNodes=allNodes, ejectedNodes=toBeEjectedNodes)
                self.assertTrue(removed, msg="Unable to remove nodes {0}".format(toBeEjectedNodes))
                remote = RemoteMachineShellConnection(server)
                remote.membase_uninstall()
                remote.couchbase_uninstall()
                remote.download_build(appropriate_build)
                remote.membase_install(appropriate_build)
                RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
                log.info("sleep for 10 seconds to wait for membase-server to start...")
                time.sleep(TIMEOUT_SECS)
                rest.init_cluster_port(rest_settings.rest_username, rest_settings.rest_password)
                rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
                remote.disconnect()

                # re-add this node to the cluster
                ClusterOperationHelper.add_all_nodes_or_assert(master, [server], rest_settings, self)
                nodes = rest.node_statuses()
                otpNodeIds = [node.id for node in nodes]
                rebalanceStarted = rest.rebalance(otpNodeIds, [])
                self.assertTrue(rebalanceStarted,
                                "unable to start rebalance on master node {0}".format(master.ip))
                log.info('started rebalance operation on master node {0}'.format(master.ip))
                rebalanceSucceeded = rest.monitorRebalance()
                self.assertTrue(rebalanceSucceeded,
                                "rebalance operation for nodes: {0} was not successful".format(otpNodeIds))
                #ClusterOperationHelper.verify_persistence(servers, self)

            # TODO: how can we verify that the cluster init config is preserved?
            # verify data on upgraded nodes
            if create_buckets:
                self.assertTrue(BucketOperationHelper.wait_for_bucket_creation('default', RestConnection(master)),
                                msg="bucket 'default' does not exist..")
            if insert_data:
                self._verify_data(master, rest, inserted_keys)
                rest = RestConnection(master)
                buckets = rest.get_buckets()
                for bucket in buckets:
                    BucketOperationHelper.keys_exist_or_assert(bucket_data[bucket.name]["inserted_keys"],
                                                               master,
                                                               bucket.name, self)
Example No. 60
0
    def set_up_rest(self, master):
        self.rest = RestConnection(master)
        self.rest_helper = RestHelper(self.rest)