Example No. 1
    def rebalance_swap(servers, how_many, monitor=True):
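        # Swap rebalance: eject `how_many` of the current nodes and add the same number of unused servers.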
        if how_many < 1:
            log.error("failed to swap rebalance %s servers - invalid count" % how_many)
            return False, []

        rest = RestConnection(servers[0])
        cur_nodes = rest.node_statuses()
        cur_ips = map(lambda node: node.ip, cur_nodes)
        cur_ids = map(lambda node: node.id, cur_nodes)
        free_servers = filter(lambda server: server.ip not in cur_ips, servers)

        if len(cur_ids) <= how_many or len(free_servers) < how_many:
            log.error("failed to swap rebalance %s servers - not enough servers" % how_many)
            return False, []

        ejections = cur_ids[-how_many:]
        additions = free_servers[:how_many]

        log.info("swap rebalance: cur: %s, eject: %s, add: %s" % (cur_ids, ejections, additions))

        try:
            map(
                lambda server: rest.add_node(
                    servers[0].rest_username, servers[0].rest_password, server.ip, server.port
                ),
                additions,
            )
        except (ServerAlreadyJoinedException, ServerSelfJoinException, AddNodeException), e:
            log.error("failed to swap rebalance - addition failed %s: %s" % (additions, e))
            return False, []
Example No. 2
 def test_oom_increase_mem_quota(self):
     """
     1. Get OOM
     2. Drop Already existing indexes
     3. Verify if state of indexes is changed to ready
     :return:
     """
     self.assertTrue(self._push_indexer_off_the_cliff(), "OOM Can't be achieved")
     indexer_memQuota = self.get_indexer_mem_quota()
     log.info("Current Indexer Memory Quota is {}".format(indexer_memQuota))
     for cnt in range(5):
         indexer_memQuota += 200
         log.info("Increasing Indexer Memory Quota to {0}".format(indexer_memQuota))
         rest = RestConnection(self.oomServer)
         rest.set_indexer_memoryQuota(indexMemoryQuota=indexer_memQuota)
         self.sleep(120)
         indexer_oom = self._validate_indexer_status_oom()
         if not indexer_oom:
             log.info("Indexer out of OOM...")
             break
     self.assertFalse(self._validate_indexer_status_oom(), "Indexer still in OOM")
     self.sleep(120)
     self._verify_bucket_count_with_index_count(self.load_query_definitions)
     self.multi_query_using_index(buckets=self.buckets,
                 query_definitions=self.load_query_definitions)
Example No. 3
 def setUp(self):
     super(SecondaryIndexMemdbOomTests, self).setUp()
     self.indexMemQuota = self.input.param("indexMemQuota", 256)
     query_template = QUERY_TEMPLATE
     self.query_template = query_template.format("job_title")
     self.doc_ops = self.input.param("doc_ops", True)
     self.initial_index_number = self.input.param("initial_index_number", 2)
     self.oomServer = self.get_nodes_from_services_map(service_type="index")
     self.whereCondition= self.input.param("whereCondition"," job_title != \"Sales\" ")
     self.query_template += " WHERE {0}".format(self.whereCondition)
     rest = RestConnection(self.oomServer)
     if self.indexMemQuota > 256:
         log.info("Setting indexer memory quota to {0} MB...".format(self.indexMemQuota))
         rest.set_indexer_memoryQuota(indexMemoryQuota=self.indexMemQuota)
         self.sleep(30)
     self.deploy_node_info = ["{0}:{1}".format(self.oomServer.ip, self.oomServer.port)]
     self.load_query_definitions = []
     for x in range(self.initial_index_number):
         index_name = "index_name_"+str(x)
         query_definition = QueryDefinition(index_name=index_name, index_fields = ["job_title"],
                     query_template = self.query_template, groups = ["simple"])
         self.load_query_definitions.append(query_definition)
     self._initialize_multi_create_index(buckets = self.buckets,
                                        query_definitions = self.load_query_definitions,
                                        deploy_node_info=self.deploy_node_info)
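     # Reset the indexer memory quota to 300 MB once the initial indexes are in place.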
     log.info("Setting indexer memory quota to 300 MB...")
     rest.set_indexer_memoryQuota(indexMemoryQuota=300)
     self.sleep(30)
Example No. 4
    def pick_node(master):
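        # Pick a cluster node other than the master; if all nodes share one IP, fall back to comparing ports.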
        rest = RestConnection(master)
        nodes = rest.node_statuses()
        node_picked = None
        nodes_on_same_ip = True

        firstIp = nodes[0].ip
        for node in nodes:
            if node.ip != firstIp:
                nodes_on_same_ip = False
                break

        for node in nodes:
            node_picked = node
            if not nodes_on_same_ip:
                if node_picked.ip != master.ip:
                    log.info("Picked node ... {0}:{1}".format(node_picked.ip, node_picked.port))
                    break
            else:
                # temp fix - port numbers of master (machine ip) and localhost:9000 match
                if int(node_picked.port) == int(master.port):
                    log.info(
                        "Not picking the master node {0}:{1}.. try again...".format(node_picked.ip, node_picked.port)
                    )
                else:
                    log.info("Picked  node {0}:{1}".format(node_picked.ip, node_picked.port))
                    break
        return node_picked
Example No. 5
 def test_change_mem_quota_when_index_building(self):
     rest = RestConnection(self.oomServer)
     log.info("Setting indexer memory quota to 700 MB...")
     rest.set_indexer_memoryQuota(indexMemoryQuota=700)
     self.sleep(30)
     query_definitions = []
     for x in range(3):
         index_name = "index_"+str(x)
         query_definition = QueryDefinition(index_name=index_name, index_fields = ["job_title"],
                     query_template = self.query_template, groups = ["simple"])
         query_definitions.append(query_definition)
     create_tasks = []
     build_tasks = []
     index_info = {}
     for bucket in self.buckets:
         if not bucket in index_info.keys():
             index_info[bucket] = []
         for query_definition in query_definitions:
             index_info[bucket].append(query_definition.index_name)
             task = self.async_create_index(bucket.name, query_definition)
             create_tasks.append(task)
     for task in create_tasks:
         task.result()
     if self.defer_build:
         log.info("Building Indexes...")
         for key, val in index_info.iteritems():
             task = self.async_build_index(bucket=key, index_list=val)
             build_tasks.append(task)
     self.sleep(10)
     log.info("Setting indexer memory quota to 500 MB...")
     rest.set_indexer_memoryQuota(indexMemoryQuota=500)
     self.sleep(30)
     for task in build_tasks:
         task.result()
Example No. 6
    def test_zone_enable_after_upgrade_from_ce_to_ee(self):
        params = {}
        params['product'] = self.product
        params['version'] = self.version
        params['vbuckets'] = [self.vbuckets]
        params['type'] = self.type
        """ install couchbasse server community edition to run the test """
        InstallerJob().parallel_install(self.servers[:3], params)

        params["type"] = "enterprise"
        zone_name = "AAABBBCCCaakkkkmmm345672"
        serverInfo = self.servers[0]
        ini_servers = self.servers[:self.nodes_init]
        rest = RestConnection(serverInfo)
        self.user = serverInfo.rest_username
        self.password = serverInfo.rest_password
        if len(ini_servers) > 1:
            self.cluster.rebalance([ini_servers[0]], ini_servers[1:], [])
        rest = RestConnection(self.master)
        self._bucket_creation()

        """ verify all nodes in cluster in CE """
        if rest.is_enterprise_edition():
            raise Exception("This test needs couchbase server community edition to run")

        self._load_all_buckets(self.servers[0], self.gen_load, "create", 0)
        try:
            self.log.info("create zone {0}".format(zone_name))
            result = rest.add_zone(zone_name)
            if result:
                raise Exception("Zone feature should not be available in CE version")
        except Exception, e:
            if "Failed" in str(e):
                pass
Example No. 7
 def _verify_zone(self, name):
     serverInfo = self.servers[0]
     rest = RestConnection(serverInfo)
     if rest.is_zone_exist(name.strip()):
         self.log.info("verified! zone '{0}' is existed".format(name.strip()))
     else:
         raise Exception("There is not zone with name: %s in cluster" % name)
Example No. 8
    def rebalance_in_out_at_once_with_max_buckets_number(self):
        servs_init = self.servers[:self.nodes_init]
        servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)]
        servs_out = [self.servers[self.nodes_init - i - 1] for i in range(self.nodes_out)]
        rest = RestConnection(self.master)
        self._wait_for_stats_all_buckets(servs_init)
        self.log.info("current nodes : {0}".format([node.id for node in rest.node_statuses()]))
        self.log.info("adding nodes {0} to cluster".format(servs_in))
        self.log.info("removing nodes {0} from cluster".format(servs_out))
        result_nodes = set(servs_init + servs_in) - set(servs_out)

        rest = RestConnection(self.master)
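        # Create as many buckets as maxBucketCount allows, splitting the available quota evenly between them.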
        bucket_num = rest.get_internalSettings("maxBucketCount")
        self.bucket_size = self.quota / bucket_num

        self.log.info('total %s buckets will be created with size %s MB' % (bucket_num, self.bucket_size))
        self.cluster.create_default_bucket(self.master, self.bucket_size, self.num_replicas)
        self.buckets.append(Bucket(name="default", authType="sasl", saslPassword="",
                                       num_replicas=self.num_replicas, bucket_size=self.bucket_size))
        self._create_sasl_buckets(self.master, (bucket_num - 1) / 2)
        self._create_standard_buckets(self.master, bucket_num - 1 - (bucket_num - 1) / 2)

        gen = BlobGenerator('mike', 'mike-', self.value_size, end=self.num_items)
        self._load_all_buckets(self.master, gen, "create", 0)
        self._wait_for_stats_all_buckets(servs_init)

        rebalance = self.cluster.async_rebalance(servs_init, servs_in, servs_out)
        self._async_load_all_buckets(self.master, gen, "update", 0)
        rebalance.result()
        self.verify_cluster_stats(result_nodes)
Example No. 9
 def test_delete_empty_default_zone(self):
     zone_name = "test1"
     default_zone = "Group 1"
     moved_node = []
     serverInfo = self.servers[0]
     moved_node.append(serverInfo.ip)
     rest = RestConnection(serverInfo)
     try:
         self.log.info("create zone {0}".format(zone_name))
         rest.add_zone(zone_name)
         if rest.is_zone_exist(zone_name):
             self.log.info("Move node {0} from zone {1} to zone {2}" \
                           .format(moved_node, default_zone, zone_name))
             status = rest.shuffle_nodes_in_zones(moved_node, default_zone, zone_name)
             if status:
                 rest.delete_zone(default_zone)
             else:
                 self.fail("Failed to move node {0} from zone {1} to zone {2}" \
                           .format(moved_node, default_zone, zone_name))
             if not rest.is_zone_exist(default_zone):
                 self.log.info("successful delete default zone")
             else:
                 raise Exception("Failed to delete default zone")
         rest.rename_zone(zone_name, default_zone)
     except Exception, e:
         print e
Example No. 10
 def test_fileRotate20MB(self):
     auditIns = audit(host=self.master)
     firstEventTime = self.getTimeStampForFile(auditIns)
     tempEventCounter = 0
     rest = RestConnection(self.master)
     shell = RemoteMachineShellConnection(self.master)
     filePath = auditIns.pathLogFile + auditIns.AUDITLOGFILENAME
     number = int (shell.get_data_file_size(filePath))
     hostname = shell.execute_command("hostname")
     archiveFile = hostname[0][0] + '-' + firstEventTime + "-audit.log"
     result = shell.file_exists(auditIns.pathLogFile, archiveFile)
     tempTime = 0
     starttime = time.time()
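     # Keep generating audit events until audit.log grows past ~20 MB (21089520 bytes), the rotated archive appears, or 10 hours elapse.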
     while ((number < 21089520) and (tempTime < 36000) and (result == False)):
         for i in range(1, 10):
             status, content = rest.validateLogin("Administrator", "password", True, getContent=True)
             tempEventCounter += 1
             number = int (shell.get_data_file_size(filePath))
             currTime = time.time()
             tempTime = int (currTime - starttime)
             result = shell.file_exists(auditIns.pathLogFile, archiveFile)
     self.sleep(30)
     result = shell.file_exists(auditIns.pathLogFile, archiveFile)
     shell.disconnect()
     self.log.info ("--------Total Event Created ---- {0}".format(tempEventCounter))
     self.assertTrue(result, "Archive Audit.log is not created on reaching 20MB threshhold")
Example No. 11
    def rebalance_in_out_at_once_persistence_stopped(self):
        num_nodes_with_stopped_persistence = self.input.param("num_nodes_with_stopped_persistence", 1)
        servs_init = self.servers[:self.nodes_init]
        servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)]
        servs_out = [self.servers[self.nodes_init - i - 1] for i in range(self.nodes_out)]
        rest = RestConnection(self.master)
        self._wait_for_stats_all_buckets(servs_init)
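        # Stop persistence on the first N nodes so the extra items stay in memory only during the rebalance.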
        for server in servs_init[:min(num_nodes_with_stopped_persistence, self.nodes_init)]:
            shell = RemoteMachineShellConnection(server)
            for bucket in self.buckets:
                shell.execute_cbepctl(bucket, "stop", "", "", "")
        self.sleep(5)
        self.num_items_without_persistence = self.input.param("num_items_without_persistence", 100000)
        gen_extra = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2\
                                      , end=self.num_items / 2 + self.num_items_without_persistence)
        self.log.info("current nodes : {0}".format([node.id for node in rest.node_statuses()]))
        self.log.info("adding nodes {0} to cluster".format(servs_in))
        self.log.info("removing nodes {0} from cluster".format(servs_out))
        tasks = self._async_load_all_buckets(self.master, gen_extra, "create", 0, batch_size=1000)
        result_nodes = set(servs_init + servs_in) - set(servs_out)
        # wait timeout in 60 min because MB-7386 rebalance stuck
        self.cluster.rebalance(servs_init[:self.nodes_init], servs_in, servs_out, timeout=self.wait_timeout * 60)
        for task in tasks:
            task.result()

        self._wait_for_stats_all_buckets(servs_init[:self.nodes_init - self.nodes_out], \
                                         ep_queue_size=self.num_items_without_persistence * 0.9, ep_queue_size_cond='>')
        self._wait_for_stats_all_buckets(servs_in)
        self._verify_all_buckets(self.master, timeout=None)
        self._verify_stats_all_buckets(result_nodes)
        #verify that curr_items_tot corresponds to sum of curr_items from all nodes
        verified = True
        for bucket in self.buckets:
            verified &= RebalanceHelper.wait_till_total_numbers_match(self.master, bucket)
        self.assertTrue(verified, "Lost items!!! Replication was completed but sum(curr_items) don't match the curr_items_total")
Example No. 12
    def test_rotateIntervalCluster(self):
        intervalSec = self.input.param("intervalSec", None)
        nodes_init = self.input.param("nodes_init", 2)
        auditIns = audit(host=self.master)
        auditIns.setAuditEnable('true')
        originalInt = auditIns.getAuditRotateInterval()
        auditIns.setAuditRotateInterval(intervalSec)
        firstEventTime = []

        try:
            for i in range(len(self.servers[:nodes_init])):
                auditTemp = audit(host=self.servers[i])
                firstEventTime.append(self.getTimeStampForFile(auditTemp))

            self.sleep(intervalSec + 20, 'Sleep for log roll over to happen')

            for i in range(len(self.servers[:nodes_init])):
                shell = RemoteMachineShellConnection(self.servers[i])
                rest = RestConnection(self.servers[i])
                status, content = rest.validateLogin(self.master.rest_username, self.master.rest_password, True, getContent=True)
                self.sleep(120, "sleeping for log file creation")
                try:
                    hostname = shell.execute_command("hostname")
                    self.log.info ("print firstEventTime {0}".format(firstEventTime[i]))
                    archiveFile = hostname[0][0] + '-' + firstEventTime[i] + "-audit.log"
                    self.log.info ("Archive File Name is {0}".format(archiveFile))
                    result = shell.file_exists(auditIns.pathLogFile, archiveFile)
                    self.assertTrue(result, "Archive Audit.log is not created on time interval")
                    self.log.info ("Validation of archive File created is True, Audit archive File is created {0}".format(archiveFile))
                    result = shell.file_exists(auditIns.pathLogFile, auditIns.AUDITLOGFILENAME)
                    self.assertTrue(result, "Audit.log is not created as per the roll over time specified")
                finally:
                    shell.disconnect()
        finally:
            auditIns.setAuditRotateInterval(originalInt)
Example No. 13
    def test_folderMisMatchCluster(self):
        auditIns = audit(host=self.master)
        originalPath = auditIns.getAuditLogPath()
        newPath = originalPath + 'testFolderMisMatch'
        shell = RemoteMachineShellConnection(self.servers[0])
        try:
            shell.create_directory(newPath)
            command = 'chown couchbase:couchbase ' + newPath
            shell.execute_command(command)
        finally:
            shell.disconnect()

        auditIns.setAuditLogPath(newPath)

        for server in self.servers:
            rest = RestConnection(server)
            #Create an Event for Bucket Creation
            expectedResults = {'name':'TestBucket ' + server.ip, 'ram_quota':536870912, 'num_replicas':1,
                                       'replica_index':False, 'eviction_policy':'value_only', 'type':'membase', \
                                       'auth_type':'sasl', "autocompaction":'false', "purge_interval":"undefined", \
                                        "flush_enabled":False, "num_threads":3, "source":source, \
                                       "user":user, "ip":self.ipAddress, "port":57457, 'sessionid':'' }
            rest.create_bucket(expectedResults['name'], expectedResults['ram_quota'] / 1048576, expectedResults['auth_type'], 'password', expectedResults['num_replicas'], \
                                       '11211', 'membase', 0, expectedResults['num_threads'], expectedResults['flush_enabled'], 'valueOnly')

            #Check on Events
            try:
                self.checkConfig(self.eventID, self.servers[0], expectedResults)
            except:
                self.log.info ("Issue reading the file at Node {0}".format(server.ip))
Example No. 14
 def test_invalidLogPathCluster(self):
     auditIns = audit(host=self.master)
     newPath = auditIns.getAuditLogPath() + 'test'
     rest = RestConnection(self.master)
     status, content = rest.setAuditSettings(logPath=newPath)
     self.assertFalse(status, "Audit is able to set invalid path")
     self.assertEqual(content['errors']['logPath'], 'The value must be a valid directory', 'No error or error changed')
Example No. 15
 def test_rotateInterval(self):
     intervalSec = self.input.param("intervalSec", None)
     auditIns = audit(host=self.master)
     rest = RestConnection(self.master)
     originalInt = auditIns.getAuditRotateInterval()
     try:
         firstEventTime = self.getTimeStampForFile(auditIns)
         self.log.info ("first time evetn is {0}".format(firstEventTime))
         auditIns.setAuditRotateInterval(intervalSec)
         self.sleep(intervalSec + 20, 'Sleep for log roll over to happen')
         status, content = rest.validateLogin(self.master.rest_username, self.master.rest_password, True, getContent=True)
         self.sleep(120)
         shell = RemoteMachineShellConnection(self.master)
         try:
             hostname = shell.execute_command("hostname")
             archiveFile = hostname[0][0] + '-' + firstEventTime + "-audit.log"
             self.log.info ("Archive File Name is {0}".format(archiveFile))
             result = shell.file_exists(auditIns.pathLogFile, archiveFile)
             self.assertTrue(result, "Archive Audit.log is not created on time interval")
             self.log.info ("Validation of archive File created is True, Audit archive File is created {0}".format(archiveFile))
             result = shell.file_exists(auditIns.pathLogFile, auditIns.AUDITLOGFILENAME)
             self.assertTrue(result, "Audit.log is not created when memcached server is killed")
         finally:
             shell.disconnect()
     finally:
         auditIns.setAuditRotateInterval(originalInt)
Example No. 16
    def test_root_crt_rotate_cluster(self):
        rest = RestConnection(self.master)
        x509main(self.master).setup_master()
        x509main().setup_cluster_nodes_ssl(self.servers)
        rest.create_bucket(bucket='default', ramQuotaMB=100)
        self.sleep(30)
        servers_in = self.servers[1:]
        self.cluster.rebalance(self.servers, servers_in, [])

        for server in self.servers:
            result  = self._sdk_connection(host_ip=server.ip)
            self.assertTrue(result,"Can create a ssl connection with correct certificate")

        result,cb   = self._sdk_connection(host_ip=self.master.ip)
        create_docs = Thread(name='create_docs', target=self.createBulkDocuments, args=(cb,))
        create_docs.start()
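        # Regenerate the root certificate and redeploy the node certificates while documents are still being loaded.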

        x509main(self.master)._delete_inbox_folder()
        x509main(self.master)._generate_cert(self.servers,root_cn="CB\ Authority")
        x509main(self.master).setup_master()
        x509main().setup_cluster_nodes_ssl(self.servers,reload_cert=True)


        create_docs.join()

        for server in self.servers:
            result  = self._sdk_connection(host_ip=server.ip)
            self.assertTrue(result,"Can create a ssl connection with correct certificate")
Example No. 17
 def _create_buckets(self, nodes):
     master_node = nodes[0]
     num_buckets = 0
     if self._default_bucket:
         num_buckets += 1
     num_buckets += self._sasl_buckets + self._standard_buckets
     bucket_size = self._get_bucket_size(master_node, nodes, self._mem_quota_int, num_buckets)
     rest = RestConnection(master_node)
     master_id = rest.get_nodes_self().id
     if self._default_bucket:
         if self._default_quota != 0:
             bucket_size = self._default_quota
         rest = RestConnection(nodes[0])
         rest.create_bucket(
             bucket=self.default_bucket_name,
             ramQuotaMB=bucket_size,
             replicaNumber=self._num_replicas,
             proxyPort=11211,
             authType="none",
             saslPassword=None,
         )
         self._buckets.append(self.default_bucket_name)
     if self._sasl_buckets > 0:
         if self._sasl_quota != 0:
             bucket_size = self._sasl_quota
         self._create_sasl_buckets(master_node, master_id, bucket_size, password="******")
     if self._standard_buckets > 0:
         if self._standard_quota != 0:
             bucket_size = self._standard_quota
         self._create_standard_buckets(master_node, master_id, bucket_size)
Example No. 18
    def test_compare_views_all_nodes_x_docs(self):
        num_docs = self.helper.input.param("num-docs", 100)
        self.log.info("description : creates view on {0} documents, queries "
                      "all nodes (not only the master node) and compares "
                      "if the results are all the same"\
                          .format(num_docs))
        design_name = "dev_test_compare_views_{0}_docs".format(num_docs)
        prefix = str(uuid.uuid4())[:7]

        inserted_keys = self._setup_index(design_name, num_docs, prefix)

        nodes = self.helper.rest.get_nodes()
        params = {"connection_timeout": 60000, "full_set": True}

        # Query every single node and verify
        for n in nodes:
            n_rest = RestConnection({
                    "ip": n.ip,
                    "port": n.port,
                    "username": self.helper.master.rest_username,
                    "password": self.helper.master.rest_password})
            results = n_rest.spatial_results(self.helper.bucket, design_name,
                                             params, None)
            result_keys = self.helper.get_keys(results)
            self.helper.verify_result(inserted_keys, result_keys)
Example No. 19
 def test_get_cluster_ca_self_signed(self):
     rest = RestConnection(self.master)
     rest.regenerate_cluster_certificate()
     status, content, header = x509main(self.master)._get_cluster_ca_cert()
     content = json.loads(content)
     self.assertTrue(status,"Issue while Cluster CA Cert")
     self.assertEqual(content['cert']['type'],"generated","Type of certificate is mismatch")
Example No. 20
 def items_verification(test, master):
     rest = RestConnection(master)
     # Verify items count across all nodes
     timeout = 600
     for bucket in rest.get_buckets():
         verified = RebalanceHelper.wait_till_total_numbers_match(master, bucket.name, timeout_in_seconds=timeout)
         test.assertTrue(verified, "Lost items!!.. failing test in {0} secs".format(timeout))
Example No. 21
 def setUp(self):
     super(XDCRTests, self).setUp()
     self.bucket = Bucket()
     self._initialize_nodes()
     self.master = self.servers[0]
     for server in self.servers:
         rest=RestConnection(server)
         cluster_status = rest.cluster_status()
         self.log.info("Initial status of {0} cluster is {1}".format(server.ip,
                                                                     cluster_status['nodes'][0]['status']))
         while cluster_status['nodes'][0]['status'] == 'warmup':
             self.log.info("Waiting for cluster to become healthy")
             self.sleep(5)
             cluster_status = rest.cluster_status()
         self.log.info("current status of {0}  is {1}".format(server.ip,
                                                              cluster_status['nodes'][0]['status']))
     # Delete all buckets before creating new buckets
     self.log.info("Deleting all existing buckets")
     BucketOperationHelper.delete_all_buckets_or_assert(self.servers, self)
     self.log.info("Creating new buckets")
     src_bucket = self.input.param('src_bucket', self.bucket)
     dest_bucket = self.input.param('dest_bucket', self.bucket)
     if src_bucket:
         RestConnection(self.servers[0]).create_bucket(bucket='default', ramQuotaMB=500)
     if dest_bucket:
         RestConnection(self.servers[1]).create_bucket(bucket='default', ramQuotaMB=500)
     helper = BaseHelper(self)
     helper.login()
Example No. 22
    def verifyLdapSettings(self, server, admins, ro_admins, default, enabled):
        rest = RestConnection(server)
        settings = rest.ldapRestOperationGetResponse()

        if admins is None:
            admins = []
        else:
            admins = admins.split(",")

        if ro_admins is None:
            ro_admins = []
        else:
            ro_admins = ro_admins.split(",")

        if str(enabled) == "0":
            admins = []
            ro_admins = []

        if default == "admins" and str(enabled) == "1":
            if settings["admins"] != "asterisk":
                log.info("Admins don't match (%s vs asterisk)", settings["admins"])
                return False
        elif not self._list_compare(settings["admins"], admins):
            log.info("Admins don't match (%s vs %s)", settings["admins"], admins)
            return False

        if default == "roadmins" and str(enabled) == "1":
            if settings["roAdmins"] != "asterisk":
                log.info("Read only admins don't match (%s vs asterisk)", settings["roAdmins"])
                return False
        elif not self._list_compare(settings["roAdmins"], ro_admins):
            log.info("Read only admins don't match (%s vs %s)", settings["roAdmins"], ro_admins)
            return False

        return True
Example No. 23
    def verifyIndexSettings(self, server, max_rollbacks, stable_snap_interval, mem_snap_interval,
                                                     storage_mode, threads, log_level):
        rest = RestConnection(server)
        settings = rest.get_global_index_settings()

        if storage_mode == "default":
            storage_mode = "forestdb"
        elif storage_mode == "memopt":
            storage_mode = "memory_optimized"

        if max_rollbacks and str(settings["maxRollbackPoints"]) != str(max_rollbacks):
            log.info("Max rollbacks does not match (%s vs. %s)", str(settings["maxRollbackPoints"]), str(max_rollbacks))
            return False
        if stable_snap_interval and str(settings["stableSnapshotInterval"]) != str(stable_snap_interval):
            log.info("Stable snapshot interval does not match (%s vs. %s)", str(settings["stableSnapshotInterval"]),
                     str(stable_snap_interval))
            return False
        if mem_snap_interval and str(settings["memorySnapshotInterval"]) != str(mem_snap_interval):
            log.info("Memory snapshot interval does not match (%s vs. %s)", str(settings["memorySnapshotInterval"]),
                     str(mem_snap_interval))
            return False
        if storage_mode and str(settings["storageMode"]) != str(storage_mode):
            log.info("Storage mode does not match (%s vs. %s)", str(settings["storageMode"]), str(storage_mode))
            return False
        if threads and str(settings["indexerThreads"]) != str(threads):
            log.info("Threads does not match (%s vs. %s)", str(settings["indexerThreads"]), str(threads))
            return False
        if log_level and str(settings["logLevel"]) != str(log_level):
            log.info("Log level does not match (%s vs. %s)", str(settings["logLevel"]), str(log_level))
            return False

        return True
Example No. 24
 def common_test_body(self, replica, load_ratio, timeout=10):
     log = logger.Logger.get_logger()
     start_time = time.time()
     log.info("replica : {0}".format(replica))
     log.info("load_ratio : {0}".format(load_ratio))
     master = self._servers[0]
     log.info('picking server : {0} as the master'.format(master))
     rest = RestConnection(master)
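     # Alternately rebalance nodes in and out until `timeout` minutes have elapsed.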
     while time.time() < ( start_time + 60 * timeout):
         #rebalance out step nodes
         #let's add some items ?
         nodes = rest.node_statuses()
         delta = len(self._servers) - len(nodes)
         if delta > 0:
             if delta > 1:
                 how_many_add = Random().randint(1, delta)
             else:
                 how_many_add = 1
             self.log.info("going to add {0} nodes".format(how_many_add))
             self.rebalance_in(how_many=how_many_add)
         else:
             self.log.info("all nodes already joined the cluster")
         time.sleep(30 * 60)
         # only rebalance out when at least 3/4 of the servers are already in the cluster
         if len(nodes) >= (3.0 / 4.0 * len(self._servers)):
             nodes = rest.node_statuses()
             how_many_out = Random().randint(1, len(nodes) - 1)
             self.log.info("going to remove {0} nodes".format(how_many_out))
             self.rebalance_out(how_many=how_many_out)
Example No. 25
    def verifyServices(self, server, expected_services):
        """Verifies that the services on a given node match the expected service

            Options:
            server - A TestInputServer object of the server to connect to
            expected_services - A comma separated list of services

            Returns a boolean corresponding to whether or not the expected services are available on the server.
        """
        rest = RestConnection(server)
        hostname = "%s:%s" % (server.ip, server.port)
        expected_services = expected_services.replace("data", "kv")
        expected_services = expected_services.replace("query", "n1ql")
        expected_services = expected_services.split(",")

        nodes_services = rest.get_nodes_services()
        for node, services in nodes_services.iteritems():
            if node.encode('ascii') == hostname:
                if len(services) != len(expected_services):
                    log.info("Services on %s do not match expected services (%s vs. %s)",
                             hostname, services, expected_services)
                    return False
                for service in services:
                    if service.encode("ascii") not in expected_services:
                        log.info("Services on %s do not match expected services (%s vs. %s)",
                                 hostname, services, expected_services)
                        return False
                return True

        log.info("Services on %s not found, the server may not exist", hostname)
        return False
Example No. 26
    def create_bucket(serverInfo, name='default', replica=1, port=11210, test_case=None, bucket_ram=-1, password=None):
        log = logger.Logger.get_logger()
        rest = RestConnection(serverInfo)
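        # Default the bucket quota to two thirds of the node's memory quota when none is given.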
        if bucket_ram < 0:
            info = rest.get_nodes_self()
            bucket_ram = info.memoryQuota * 2 / 3

        if password is None:
            authType = "sasl"
        else:
            authType = "none"

        rest.create_bucket(bucket=name,
                           ramQuotaMB=bucket_ram,
                           replicaNumber=replica,
                           proxyPort=port,
                           authType=authType,
                           saslPassword=password)
        msg = 'create_bucket succeeded but bucket "{0}" does not exist'
        bucket_created = BucketOperationHelper.wait_for_bucket_creation(name, rest)
        if not bucket_created:
            log.error(msg.format(name))
            if test_case:
                test_case.fail(msg=msg.format(name))
        return bucket_created
Example No. 27
    def backup(self):
        while True:
            try:
                x = self.queue.get_nowait()
                self.log.info("get_nowait : {0}".format(x))

                break
                # things are normal, just do another backup after
                # waiting for self.interval
            except Exception:
                master = self.servers[0]
                rest = RestConnection(master)
                nodes = rest.node_statuses()
                map = self.node_server_map(nodes, self.servers)
                self.log.info("cluster has {0} nodes".format(len(nodes)))
                for node in nodes:
                    try:
                        from Crypto.Random import atfork
                        atfork()
                        BackupHelper(map[node]).backup('default', "/tmp")
                        BackupHelper(map[node]).backup('default', "/tmp")
                    except Exception as ex:
                        print ex
                    self.log.info("backed up the data into ")
                time.sleep(self.interval)
Example No. 28
    def verifyRecoveryType(self, server, recovery_servers, recovery_type):
        rest = RestConnection(server)
        settings = rest.get_all_zones_info()
        if not settings or "groups" not in settings:
            log.info("Group settings payload appears to be invalid")
            return False

        if not recovery_servers:
            return True

        num_found = 0
        recovery_servers = recovery_servers.split(",")
        for group in settings["groups"]:
            for node in group["nodes"]:
                for rs in recovery_servers:
                    if node["hostname"] == rs:
                        if node["recoveryType"] != recovery_type:
                            log.info("Node %s doesn't contain recovery type %s ", rs, recovery_type)
                            return False
                        else:
                            num_found = num_found + 1

        if num_found == len(recovery_servers):
            return True

        log.info("Node `%s` not found in nodes list", ",".join(recovery_servers))
        return False
Example No. 29
 def create_primary_index_for_3_0_and_greater(self):
     self.log.info("CREATE PRIMARY INDEX using %s" % self.primary_indx_type)
     rest = RestConnection(self.master)
     versions = rest.get_nodes_versions()
     if versions[0].startswith("4") or versions[0].startswith("3") or versions[0].startswith("5"):
         for bucket in self.buckets:
             if self.primary_indx_drop:
                 self.log.info("Dropping primary index for %s using %s ..." % (bucket.name,self.primary_indx_type))
                 self.query = "DROP PRIMARY INDEX ON %s USING %s" % (bucket.name,self.primary_indx_type)
                 #self.run_cbq_query()
                 self.sleep(3, 'Sleep for some time after index drop')
             self.query = 'select * from system:indexes where name="#primary" and keyspace_id = "%s"' % bucket.name
             res = self.run_cbq_query()
             self.sleep(10)
             if self.monitoring:
                 self.query = "delete from system:completed_requests"
                 self.run_cbq_query()
             if not self.skip_primary_index:
                 if (res['metrics']['resultCount'] == 0):
                     self.query = "CREATE PRIMARY INDEX ON %s USING %s" % (bucket.name, self.primary_indx_type)
                     self.log.info("Creating primary index for %s ..." % bucket.name)
                     try:
                         self.run_cbq_query()
                         self.primary_index_created = True
                         if self.primary_indx_type.lower() == 'gsi':
                             self._wait_for_index_online(bucket, '#primary')
                     except Exception, ex:
                         self.log.info(str(ex))
Example No. 30
 def setUp(self):
     super(RbacTestMemcached, self).setUp()
     rest = RestConnection(self.master)
     self.auth_type = self.input.param('auth_type','builtin')
     self.user_id = self.input.param("user_id",None)
     self.user_role = self.input.param("user_role",None)
     self.bucket_name = self.input.param("bucket_name",None)
     rest.create_bucket(bucket=self.bucket_name, ramQuotaMB=100,lww=True)
     self.role_map = self.input.param("role_map",None)
     self.incorrect_bucket = self.input.param("incorrect_bucket",False)
     self.new_role = self.input.param("new_role",None)
     self.new_role_map = self.input.param("new_role_map",None)
     self.no_bucket_access = self.input.param("no_bucket_access",False)
     self.no_access_bucket_name = self.input.param("no_access_bucket_name","noaccess")
     self.all_buckets = self.input.param("all_buckets",None)
     self.ldap_users = rbacmain().returnUserList(self.user_id)
     if self.no_bucket_access:
         rest.create_bucket(bucket=self.no_access_bucket_name, ramQuotaMB=100, lww=True)
     if self.auth_type == 'ldap':
         rbacmain(self.master, 'builtin')._delete_user('cbadminbucket')
     if self.auth_type == 'ldap':
         rbacmain().setup_auth_mechanism(self.servers,'ldap',rest)
         for user in self.ldap_users:
             testuser = [{'id': user[0], 'name': user[0], 'password': user[1]}]
             RbacBase().create_user_source(testuser, 'ldap', self.master)
             self.sleep(10)
     elif self.auth_type == "pam":
         rbacmain().setup_auth_mechanism(self.servers,'pam', rest)
         rbacmain().add_remove_local_user(self.servers, self.ldap_users, 'deluser')
         rbacmain().add_remove_local_user(self.servers, self.ldap_users,'adduser')
     elif self.auth_type == "builtin":
         for user in self.ldap_users:
             testuser = [{'id': user[0], 'name': user[0], 'password': user[1]}]
             RbacBase().create_user_source(testuser, 'builtin', self.master)
             self.sleep(10)
Example No. 31
    def wait_for_vbuckets_ready_state(node,
                                      bucket,
                                      timeout_in_seconds=300,
                                      log_msg='',
                                      admin_user='******',
                                      admin_pass='******'):
        log = logger.Logger.get_logger()
        start_time = time.time()
        end_time = start_time + timeout_in_seconds
        ready_vbuckets = {}
        rest = RestConnection(node)
        servers = rest.get_nodes()
        RestHelper(rest).vbucket_map_ready(bucket, 60)
        vbucket_count = len(rest.get_vbuckets(bucket))
        vbuckets = rest.get_vbuckets(bucket)
        obj = VBucketAwareMemcached(rest, bucket)
        memcacheds, vbucket_map, vbucket_map_replica = obj.request_map(
            rest, bucket)
        #Create dictionary with key:"ip:port" and value: a list of vbuckets
        server_dict = defaultdict(list)
        for everyID in range(0, vbucket_count):
            memcached_ip_port = str(vbucket_map[everyID])
            server_dict[memcached_ip_port].append(everyID)
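        # Poll each memcached endpoint until every vbucket reports a ready state or the timeout expires.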
        while time.time() < end_time and len(ready_vbuckets) < vbucket_count:
            for every_ip_port in server_dict:
                #Retrieve memcached ip and port
                ip = every_ip_port.rsplit(":", 1)[0]
                port = every_ip_port.rsplit(":", 1)[1]
                client = MemcachedClient(ip, int(port), timeout=30)
                client.vbucket_count = len(vbuckets)
                bucket_info = rest.get_bucket(bucket)
                cluster_compatibility = rest.check_cluster_compatibility("5.0")
                if cluster_compatibility is None:
                    pre_spock = True
                else:
                    pre_spock = not cluster_compatibility
                if pre_spock:
                    log.info("Atleast 1 of the server is on pre-spock "
                             "version. Using the old ssl auth to connect to "
                             "bucket.")
                    client.sasl_auth_plain(
                        bucket_info.name.encode('ascii'),
                        bucket_info.saslPassword.encode('ascii'))
                else:
                    client.sasl_auth_plain(admin_user, admin_pass)
                    bucket = bucket.encode('ascii')
                    client.bucket_select(bucket)
                for i in server_dict[every_ip_port]:
                    try:
                        (a, b, c) = client.get_vbucket_state(i)
                    except mc_bin_client.MemcachedError as e:
                        ex_msg = str(e)
                        if "Not my vbucket" in log_msg:
                            log_msg = log_msg[:log_msg.find("vBucketMap") +
                                              12] + "..."
                        if e.status == memcacheConstants.ERR_NOT_MY_VBUCKET:
                            # May receive this while waiting for vbuckets, continue and retry...
                            continue
                        log.error("%s: %s" % (log_msg, ex_msg))
                        continue
                    except exceptions.EOFError:
                        # The client was disconnected for some reason. This can
                        # happen just after the bucket REST API is returned (before
                        # the buckets are created in each of the memcached processes.)
                        # See here for some details: http://review.couchbase.org/#/c/49781/
                        # Longer term when we don't disconnect clients in this state we
                        # should probably remove this code.
                        log.error(
                            "got disconnected from the server, reconnecting")
                        client.reconnect()
                        client.sasl_auth_plain(
                            bucket_info.name.encode('ascii'),
                            bucket_info.saslPassword.encode('ascii'))
                        continue

                    if c.find("\x01") > 0 or c.find("\x02") > 0:
                        ready_vbuckets[i] = True
                    elif i in ready_vbuckets:
                        log.warning(
                            "vbucket state changed from active to {0}".format(
                                c))
                        del ready_vbuckets[i]
                client.close()
        return len(ready_vbuckets) == vbucket_count
Example No. 32
 def _log_start(self):
     try:
         msg = "{0} : {1} started ".format(datetime.datetime.now(), self._testMethodName)
         RestConnection(self.servers[0]).log_client_error(msg)
     except:
         pass
Example No. 33
 def getOtpNodeIds(master):
     rest = RestConnection(master)
     nodes = rest.node_statuses()
     otpNodeIds = [node.id for node in nodes]
     return otpNodeIds
Example No. 34
    def rebalance_in(servers,
                     how_many,
                     do_shuffle=True,
                     monitor=True,
                     do_check=True):
        servers_rebalanced = []
        log = logger.Logger.get_logger()
        rest = RestConnection(servers[0])
        nodes = rest.node_statuses()
        #are all ips the same
        nodes_on_same_ip = True
        firstIp = nodes[0].ip
        if len(nodes) == 1:
            nodes_on_same_ip = False
        else:
            for node in nodes:
                if node.ip != firstIp:
                    nodes_on_same_ip = False
                    break
        nodeIps = ["{0}:{1}".format(node.ip, node.port) for node in nodes]
        log.info("current nodes : {0}".format(nodeIps))
        toBeAdded = []
        master = servers[0]
        selection = servers[1:]
        if do_shuffle:
            shuffle(selection)
        for server in selection:
            if nodes_on_same_ip:
                if not "{0}:{1}".format(firstIp, server.port) in nodeIps:
                    toBeAdded.append(server)
                    servers_rebalanced.append(server)
                    log.info("choosing {0}:{1}".format(server.ip, server.port))
            elif not "{0}:{1}".format(server.ip, server.port) in nodeIps:
                toBeAdded.append(server)
                servers_rebalanced.append(server)
                log.info("choosing {0}:{1}".format(server.ip, server.port))
            if len(toBeAdded) == int(how_many):
                break

        if do_check and len(toBeAdded) < how_many:
            raise Exception(
                "unable to find {0} nodes to rebalance_in".format(how_many))

        for server in toBeAdded:
            otpNode = rest.add_node(master.rest_username, master.rest_password,
                                    server.ip, server.port)
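        # Start a rebalance with the full node list and no ejections, then optionally monitor it to completion.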
        otpNodes = [node.id for node in rest.node_statuses()]
        started = rest.rebalance(otpNodes, [])
        msg = "rebalance operation started ? {0}"
        log.info(msg.format(started))
        if monitor is not True:
            return True, servers_rebalanced
        if started:
            try:
                result = rest.monitorRebalance()
            except RebalanceFailedException as e:
                log.error("rebalance failed: {0}".format(e))
                return False, servers_rebalanced
            msg = "successfully rebalanced in selected nodes from the cluster ? {0}"
            log.info(msg.format(result))
            return result, servers_rebalanced
        return False, servers_rebalanced
Example No. 35
    def verify_items_count(master, bucket, num_attempt=3, timeout=2):
        # get the bucket info (including the replica count) from rest
        rest = RestConnection(master)
        if isinstance(bucket, Bucket):
            bucket = bucket.name
        bucket_info = rest.get_bucket(bucket, num_attempt, timeout)
        replica_factor = bucket_info.numReplicas
        vbucket_active_sum = 0
        vbucket_replica_sum = 0
        vbucket_pending_sum = 0
        all_server_stats = []
        stats_received = True
        nodes = rest.get_nodes()
        for server in nodes:
            #get the stats
            server_stats = rest.get_bucket_stats_for_node(bucket, server)
            if not server_stats:
                log.info("unable to get stats from {0}:{1}".format(
                    server.ip, server.port))
                stats_received = False
            all_server_stats.append((server, server_stats))
        if not stats_received:
            raise StatsUnavailableException()
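        # Sum curr_items and the active/replica/pending vbucket counts across all nodes.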
        sum = 0
        for server, single_stats in all_server_stats:
            if not single_stats or "curr_items" not in single_stats:
                continue
            sum += single_stats["curr_items"]
            log.info("curr_items from {0}:{1} : {2}".format(server.ip, server.port, \
                single_stats["curr_items"]))
            if 'vb_pending_num' in single_stats:
                vbucket_pending_sum += single_stats['vb_pending_num']
                log.info(
                    "vb_pending_num from {0}:{1} : {2}".format(server.ip, server.port, \
                        single_stats["vb_pending_num"]))
            if 'vb_active_num' in single_stats:
                vbucket_active_sum += single_stats['vb_active_num']
                log.info(
                    "vb_active_num from {0}:{1} : {2}".format(server.ip, server.port, \
                        single_stats["vb_active_num"]))
            if 'vb_replica_num' in single_stats:
                vbucket_replica_sum += single_stats['vb_replica_num']
                log.info(
                    "vb_replica_num from {0}:{1} : {2}".format(server.ip, server.port, \
                        single_stats["vb_replica_num"]))

        msg = "summation of vb_active_num : {0} vb_pending_num : {1} vb_replica_num : {2}"
        log.info(
            msg.format(vbucket_active_sum, vbucket_pending_sum,
                       vbucket_replica_sum))
        msg = 'sum : {0} and sum * (replica_factor + 1) ({1}) : {2}'
        log.info(
            msg.format(sum, replica_factor + 1, (sum * (replica_factor + 1))))
        master_stats = rest.get_bucket_stats(bucket)
        if "curr_items_tot" in master_stats:
            log.info('curr_items_tot from master: {0}'.format(
                master_stats["curr_items_tot"]))
        else:
            raise Exception(
                "bucket {O} stats doesnt contain 'curr_items_tot':".format(
                    bucket))
        if replica_factor >= len(nodes):
            log.warn("the number of nodes is less than replica requires")
            delta = sum * (len(nodes)) - master_stats["curr_items_tot"]
        else:
            delta = sum * (replica_factor + 1) - master_stats["curr_items_tot"]
        delta = abs(delta)

        if delta > 0:
            if sum == 0:
                missing_percentage = 0
            else:
                missing_percentage = delta * 1.0 / (sum * (replica_factor + 1))
            log.info("Nodes stats are: {0}".format([node.ip
                                                    for node in nodes]))
        else:
            missing_percentage = 1
        log.info("delta : {0} missing_percentage : {1} replica_factor : {2}".format(delta, \
            missing_percentage, replica_factor))
        # If no items missing then, return True
        if not delta:
            return True
        return False
Example No. 36
    def wait_for_mc_stats_all_nodes(master,
                                    bucket,
                                    stat_key,
                                    stat_value,
                                    timeout_in_seconds=120,
                                    verbose=True):
        log.info("waiting for bucket {0} stat : {1} to match {2} on {3}".format(bucket, stat_key, \
                                                                                stat_value, master.ip))
        time_to_timeout = 0
        previous_stat_value = -1
        curr_stat_value = -1
        verified = False
        all_stats = {}
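        # Sum the stat across all active nodes; stop when it matches the expected value or stops changing within the timeout.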
        while not verified:
            rest = RestConnection(master)
            nodes = rest.node_statuses()
            for node in nodes:
                _server = {
                    "ip": node.ip,
                    "port": node.port,
                    "username": master.rest_username,
                    "password": master.rest_password
                }
                #failed over node is part of node_statuses but since its failed over memcached connections
                #to this node will fail
                node_self = RestConnection(_server).get_nodes_self()
                if node_self.clusterMembership == 'active':
                    mc = MemcachedClientHelper.direct_client(_server, bucket)
                    n_stats = mc.stats("")
                    mc.close()
                    all_stats[node.id] = n_stats
            actual_stat_value = -1
            for k in all_stats:
                if all_stats[k] and stat_key in all_stats[k]:
                    if actual_stat_value == -1:
                        log.info(all_stats[k][stat_key])
                        actual_stat_value = int(all_stats[k][stat_key])
                    else:
                        actual_stat_value += int(all_stats[k][stat_key])
            if actual_stat_value == stat_value:
                log.info("{0} : {1}".format(stat_key, actual_stat_value))
                verified = True
                break
            else:
                if verbose:
                    log.info("{0} : {1}".format(stat_key, actual_stat_value))
                curr_stat_value = actual_stat_value

                # values are changing so clear any timeout
                if curr_stat_value != previous_stat_value:
                    time_to_timeout = 0
                else:
                    if time_to_timeout == 0:
                        time_to_timeout = time.time() + timeout_in_seconds
                    if time_to_timeout < time.time():
                        log.info(
                            "no change in {0} stat after {1} seconds (value = {2})"
                            .format(stat_key, timeout_in_seconds,
                                    curr_stat_value))
                        break

                previous_stat_value = curr_stat_value

                if not verbose:
                    time.sleep(0.1)
                else:
                    time.sleep(2)
        return verified
Example No. 37
    def test_rollback(self):
        bucket = self.src_cluster.get_buckets()[0]
        src_nodes = self.src_cluster.get_nodes()
        dest_nodes = self.dest_cluster.get_nodes()
        nodes = src_nodes + dest_nodes

        # Stop Persistence on Node A & Node B
        for node in nodes:
            mem_client = MemcachedClientHelper.direct_client(node, bucket)
            mem_client.stop_persistence()

        goxdcr_log = NodeHelper.get_goxdcr_log_dir(self._input.servers[0])\
                     + '/goxdcr.log*'
        self.setup_xdcr()

        self.src_cluster.pause_all_replications()
        self.dest_cluster.pause_all_replications()

        gen = BlobGenerator("C1-",
                            "C1-",
                            self._value_size,
                            end=self._num_items)
        self.src_cluster.load_all_buckets_from_generator(gen)
        gen = BlobGenerator("C2-",
                            "C2-",
                            self._value_size,
                            end=self._num_items)
        self.dest_cluster.load_all_buckets_from_generator(gen)

        self.src_cluster.resume_all_replications()
        self.dest_cluster.resume_all_replications()

        # Perform mutations on the bucket
        self.async_perform_update_delete()

        rest1 = RestConnection(self.src_cluster.get_master_node())
        rest2 = RestConnection(self.dest_cluster.get_master_node())

        # Fetch count of docs in src and dest cluster
        _count1 = rest1.fetch_bucket_stats(
            bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
        _count2 = rest2.fetch_bucket_stats(
            bucket=bucket.name)["op"]["samples"]["curr_items"][-1]

        self.log.info(
            "Before rollback src cluster count = {0} dest cluster count = {1}".
            format(_count1, _count2))

        # Kill memcached on Node A so that Node B becomes master
        shell = RemoteMachineShellConnection(
            self.src_cluster.get_master_node())
        shell.kill_memcached()
        shell = RemoteMachineShellConnection(
            self.dest_cluster.get_master_node())
        shell.kill_memcached()

        # Start persistence on Node B
        mem_client = MemcachedClientHelper.direct_client(src_nodes[1], bucket)
        mem_client.start_persistence()
        mem_client = MemcachedClientHelper.direct_client(dest_nodes[1], bucket)
        mem_client.start_persistence()

        # Failover Node B
        failover_task = self.src_cluster.async_failover()
        failover_task.result()
        failover_task = self.dest_cluster.async_failover()
        failover_task.result()

        # Wait for Failover & rollback to complete
        self.sleep(60)

        # Fetch count of docs in src and dest cluster
        _count1 = rest1.fetch_bucket_stats(
            bucket=bucket.name)["op"]["samples"]["curr_items"][-1]
        _count2 = rest2.fetch_bucket_stats(
            bucket=bucket.name)["op"]["samples"]["curr_items"][-1]

        self.log.info(
            "After rollback src cluster count = {0} dest cluster count = {1}".
            format(_count1, _count2))

        self.assertTrue(
            self.src_cluster.wait_for_outbound_mutations(),
            "Mutations in source cluster not replicated to target after rollback"
        )
        self.assertTrue(
            self.dest_cluster.wait_for_outbound_mutations(),
            "Mutations in target cluster not replicated to source after rollback"
        )

        count = NodeHelper.check_goxdcr_log(
            src_nodes[0], "Received rollback from DCP stream", goxdcr_log)
        self.assertGreater(count, 0, "rollback did not happen as expected")
        self.log.info("rollback happened as expected")

        count = NodeHelper.check_goxdcr_log(
            dest_nodes[0], "Received rollback from DCP stream", goxdcr_log)
        self.assertGreater(count, 0, "rollback did not happen as expected")
        self.log.info("rollback happened as expected")
Example No. 38
0
def xdcr_link_cluster(src_master, dest_master, dest_cluster_name):
    rest_conn_src = RestConnection(src_master)
    rest_conn_src.add_remote_cluster(dest_master.ip, dest_master.port,
                                 dest_master.rest_username,
                                 dest_master.rest_password, dest_cluster_name)
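A hedged usage sketch: xdcr_link_cluster only touches the ip, port, rest_username and rest_password attributes of the two master objects, so a simple stand-in record is enough to show the call shape (real TestInputServer objects carry more fields; the addresses and credentials below are placeholders):

from collections import namedtuple

Server = namedtuple("Server", "ip port rest_username rest_password")

src_master = Server("10.1.1.10", 8091, "Administrator", "password")
dest_master = Server("10.1.1.20", 8091, "Administrator", "password")

# Registers dest_master's cluster on src_master under the name "remote-dc",
# the prerequisite for creating an XDCR replication between the two.
xdcr_link_cluster(src_master, dest_master, "remote-dc")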
Example No. 39
0
class FailoverTests(FailoverBaseTest):
    def setUp(self):
        super(FailoverTests, self).setUp()
        self.server_map = self.get_server_map()

    def tearDown(self):
        super(FailoverTests, self).tearDown()

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def test_failover_then_add_back_and_rebalance_in(self):
        self.add_back_flag = True
        self.common_test_body('normal', rebalance_type="in")

    def test_failover_then_add_back_and_rebalance_out(self):
        self.add_back_flag = True
        self.common_test_body('normal', rebalance_type="out")

    def test_failover_then_add_back_and_swap_rebalance(self):
        self.add_back_flag = True
        self.common_test_body('normal', rebalance_type="swap")

    def common_test_body(self, failover_reason, rebalance_type=None):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case(before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
            4.1 Rebalance the cluster is failover of K nodeStatuses
            4.2 Run Add-Back operation with recoveryType = (full/delta)
                with rebalance
            5. Verify all expected operations completed by checking
               stats, replicaiton, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.cluster.servers[1]
        else:
            self.master = self.cluster.master
        self.log.info(
            " Picking node {0} as reference node for test case".format(
                self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Variable to decide the durability outcome
        durability_will_fail = False
        # Variable to track the number of nodes failed
        num_nodes_failed = 1

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 \
                and (self.graceful or self.recoveryType is not None):
            self.log.error(
                "Can't apply graceful failover to nodes with version < 3.*")
            self.log.error("Please check configuration params: SKIPPING TEST")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform View Creation Tasks and
        # check for completion if required before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Take snap-shot of data set used for validation
        record_static_data_set = {}
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        if not self.withMutationOps:
            record_static_data_set = self.bucket_util.get_data_set_all(
                self.cluster.servers, self.buckets, path=None)

        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                self.servers, self.buckets)
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.servers, self.buckets)

        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        target_bucket = self.bucket_util.buckets[0]

        # Update new_replica value, if provided in the conf
        if self.new_replica:
            self.num_replicas = self.new_replica
            bucket_helper = BucketHelper(self.master)
            bucket_helper.change_bucket_props(target_bucket.name,
                                              replicaNumber=self.num_replicas)

        # Decide whether the durability is going to fail or not
        if self.num_failed_nodes >= 1 and self.num_replicas > 1:
            durability_will_fail = True

        # Construct target vbucket list from the nodes
        # which are going to be failed over
        vbucket_list = list()
        for target_node in self.chosen:
            for server in self.servers:
                if server.ip == target_node.ip:
                    # Comment out the break once vbucket_list method is fixed
                    break
                    shell_conn = RemoteMachineShellConnection(server)
                    cb_stats = Cbstats(shell_conn)
                    vbuckets = cb_stats.vbucket_list(target_bucket.name,
                                                     self.target_vbucket_type)
                    shell_conn.disconnect()
                    vbucket_list += vbuckets

        # Code to generate doc_loaders that will work on vbucket_type
        # based on targeted nodes. This will perform CRUD only on
        # vbuckets which will be affected by the failover
        self.gen_create = doc_generator(self.key,
                                        self.num_items,
                                        self.num_items * 1.5,
                                        target_vbucket=vbucket_list)
        self.gen_update = doc_generator(self.key,
                                        self.num_items / 2,
                                        self.num_items,
                                        target_vbucket=vbucket_list)
        self.gen_delete = doc_generator(self.key,
                                        self.num_items / 4,
                                        self.num_items / 2 - 1,
                                        target_vbucket=vbucket_list)
        self.afterfailover_gen_create = doc_generator(
            self.key,
            self.num_items * 1.6,
            self.num_items * 2,
            target_vbucket=vbucket_list)
        self.afterfailover_gen_update = doc_generator(
            self.key, 1, self.num_items / 4, target_vbucket=vbucket_list)
        self.afterfailover_gen_delete = doc_generator(
            self.key,
            self.num_items * 0.5,
            self.num_items * 0.75,
            target_vbucket=vbucket_list)

        # Perform Add Back Operation with Rebalance
        # or only Rebalance with verifications
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.failover_onebyone:
                # Reset it back to False
                durability_will_fail = False
                for node_chosen in self.chosen:
                    if num_nodes_failed > 1:
                        durability_will_fail = True

                    if self.add_back_flag:
                        # In add-back case, durability should never fail, since
                        # the num_nodes in the cluster will remain the same
                        self.run_add_back_operation_and_verify(
                            [node_chosen],
                            prev_vbucket_stats,
                            record_static_data_set,
                            prev_failover_stats,
                            rebalance_type=rebalance_type)
                    else:
                        self.run_rebalance_after_failover_and_verify(
                            [node_chosen],
                            prev_vbucket_stats,
                            record_static_data_set,
                            prev_failover_stats,
                            durability_will_fail=durability_will_fail)
                    num_nodes_failed += 1
            else:
                if self.add_back_flag:
                    self.run_add_back_operation_and_verify(
                        self.chosen,
                        prev_vbucket_stats,
                        record_static_data_set,
                        prev_failover_stats,
                        durability_will_fail=durability_will_fail,
                        rebalance_type=rebalance_type)
                else:
                    self.run_rebalance_after_failover_and_verify(
                        self.chosen,
                        prev_vbucket_stats,
                        record_static_data_set,
                        prev_failover_stats,
                        durability_will_fail=durability_will_fail)
        else:
            return

        # Will verify_unacked_bytes only if the durability is not going to fail
        if self.during_ops is None and not durability_will_fail:
            self.bucket_util.verify_unacked_bytes_all_buckets(
                filter_list=self.filter_list)

    def run_rebalance_after_failover_and_verify(self,
                                                chosen,
                                                prev_vbucket_stats,
                                                record_static_data_set,
                                                prev_failover_stats,
                                                durability_will_fail=False):
        """ Method to run rebalance after failover and verify """
        # Need a delay > min because MB-7168
        _servers_ = self.filter_servers(self.servers, chosen)
        self.bucket_util._wait_for_stats_all_buckets(
            check_ep_items_remaining=True)
        self.sleep(5, "after failover before invoking rebalance...")
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[node.id for node in chosen])
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[node.id for node in chosen])

        # Rebalance Monitoring
        msg = "rebalance failed while removing failover nodes {0}" \
              .format([node.id for node in chosen])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
        #  Drain Queue and make sure intra-cluster replication is complete
        self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
        self.bucket_util.verify_cluster_stats(self.num_items,
                                              self.master,
                                              check_bucket_stats=True,
                                              check_ep_items_remaining=True)
        # Verify the full data set with metadata when no mutations ran during failover
        if not self.withMutationOps:
            self.sleep(60)
            self.bucket_util.data_analysis_all(record_static_data_set,
                                               _servers_,
                                               self.buckets,
                                               path=None,
                                               addedItems=None)

        # Check cluster stats and data as well if max_verify > 0
        # Failover-log checks: currently only performed for the
        # graceful failover case (version > 2.5 with upr_check enabled)
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.bucket_util.compare_failovers_logs(
                prev_failover_stats, _servers_, self.buckets)
            new_vbucket_stats = self.bucket_util.compare_vbucket_seqnos(
                prev_vbucket_stats, _servers_, self.buckets)
            self.bucket_util.compare_vbucketseq_failoverlogs(
                new_vbucket_stats, new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes,
                buckets=self.buckets,
                num_replicas=self.num_replicas,
                total_vbuckets=self.total_vbuckets,
                std=20.0)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self,
                                          chosen,
                                          prev_vbucket_stats,
                                          record_static_data_set,
                                          prev_failover_stats,
                                          durability_will_fail=False,
                                          rebalance_type=None):
        """
        Method to run add-back operation with recovery type = (delta/full).
        It also verifies if the operations are correct with data
        verificaiton steps
        """
        _servers_ = self.filter_servers(self.servers, chosen)
        self.bucket_util._wait_for_stats_all_buckets(
            check_ep_items_remaining=True)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets,
                                                   self.server_map)
        index = 0
        for node in chosen:
            self.rest.add_back_node(node.id)
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recovery type
                self.rest.set_recovery_type(
                    otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[],
                            deltaRecoveryBuckets=self.deltaRecoveryBuckets)

        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)

        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[],
                                deltaRecoveryBuckets=self.deltaRecoveryBuckets)

        if rebalance_type == "in":
            rebalance = self.task.rebalance(self.servers[:self.nodes_init],
                                            [self.servers[self.nodes_init]],
                                            [])
        if rebalance_type == "out":
            rebalance = self.task.rebalance(
                self.servers[:self.nodes_init], [],
                [self.servers[self.nodes_init - 1]])
        if rebalance_type == "swap":
            self.rest.add_node(self.master.rest_username,
                               self.master.rest_password,
                               self.servers[self.nodes_init].ip,
                               self.servers[self.nodes_init].port,
                               services=["kv"])
            rebalance = self.task.rebalance(
                self.servers[:self.nodes_init], [],
                [self.servers[self.nodes_init - 1]])

        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}" \
              .format(chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Drain ep_queue & make sure that intra-cluster replication is complete
        self.bucket_util._wait_for_stats_all_buckets(
            check_ep_items_remaining=True)

        self.log.info("Begin VERIFICATION for Add-back and rebalance")

        # Verify stats of cluster and data if max_verify > 0
        self.bucket_util.verify_cluster_stats(self.servers,
                                              self.master,
                                              check_bucket_stats=True,
                                              check_ep_items_remaining=True)

        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, self.server_map, self.buckets,
                                      recoveryTypeMap, fileMapsForVerification,
                                      self.deltaRecoveryBuckets)

        # Comparison of all data if required
        if not self.withMutationOps:
            self.sleep(60)
            self.bucket_util.data_analysis_all(record_static_data_set,
                                               self.servers,
                                               self.buckets,
                                               path=None,
                                               addedItems=None)

        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version > 2.5.* & if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.bucket_util.compare_vbucket_seqnos(
                prev_vbucket_stats, self.servers, self.buckets, perNode=False)
            new_failover_stats = self.bucket_util.compare_failovers_logs(
                prev_failover_stats, self.servers, self.buckets)
            self.bucket_util.compare_vbucketseq_failoverlogs(
                new_vbucket_stats, new_failover_stats)

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes,
                buckets=self.buckets,
                num_replicas=self.num_replicas,
                total_vbuckets=self.total_vbuckets,
                std=20.0)

        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """
        Method to run fail over operations used in the test scenario based on
        failover reason
        """
        # Perform Operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.cluster_util.stop_server(node)
                self.log.info("10 secs delay for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(self.rest).wait_for_node_status(
                        node, "unhealthy", self.wait_timeout * 10),
                    msg="node status is healthy even after waiting for 5 mins")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info(
                        "node {0}:{1} is 'unhealthy' as expected".format(
                            node.ip, node.port))
                else:
                    # verify iptables on the node if something is wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(
                    node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(
                node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes \
                        or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    self.sleep(60)
                    success_failed_over = self.rest.fail_over(
                        node.id,
                        graceful=(self.graceful and graceful_failover))
                    msg = "graceful failover failed for nodes {0}" \
                          .format(node.id)
                    self.assertTrue(
                        self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
                else:
                    msg = "rebalance failed while removing failover nodes {0}"\
                          .format(node.id)
                    self.assertTrue(
                        self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over

            # Check for negative cases
            if self.graceful and (failover_reason
                                  in ['stop_server', 'firewall']):
                if failed_over:
                    # MB-10479
                    self.rest.print_UI_logs()
                self.assertFalse(
                    failed_over,
                    "Graceful Failover was started for unhealthy node!!!")
                return
            elif self.gracefulFailoverFail and not failed_over:
                """ Check if the fail_over fails as expected """
                self.assertFalse(
                    failed_over,
                    "Graceful failover should fail due to not enough replicas")
                return

            # Check if failover happened as expected or re-try one more time
            if not failed_over:
                self.log.info(
                    "unable to failover the node on the first attempt. "
                    "trying again in 75 seconds..")
                # retry once more after 75 seconds
                self.sleep(75)
                failed_over = self.rest.fail_over(
                    node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and (failover_reason
                                  not in ['stop_server', 'firewall']):
                reached = RestHelper(self.rest).rebalance_reached()
                self.assertTrue(
                    reached,
                    "rebalance failed for Graceful Failover, stuck or did not completed"
                )

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.bucket_util.vb_distribution_analysis(
                servers=nodes,
                buckets=self.buckets,
                std=20.0,
                num_replicas=self.num_replicas,
                total_vbuckets=self.total_vbuckets,
                type="failover",
                graceful=(self.graceful and graceful_failover))

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """
        Method to run fail over operations used in the test scenario based on
        failover reason
        """
        # Perform operations related to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.cluster_util.stop_server(node)
                self.log.info(
                    "waiting for membase-server to shut down")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(self.rest).wait_for_node_status(
                        node, "unhealthy", 300),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    self.log.info(
                        "node {0}:{1} is 'unhealthy' as expected".format(
                            node.ip, node.port))
                else:
                    # verify iptables on the node if something is wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )
            nodes = self.filter_servers(self.servers, chosen)
            success_failed_over = self.rest.fail_over(node.id,
                                                      graceful=(self.graceful))
            # failed_over = self.task.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful)
            # Perform Compaction
            compact_tasks = []
            if self.compact:
                for bucket in self.buckets:
                    compact_tasks.append(
                        self.task.async_compact_bucket(self.master, bucket))
            # Run View Operations
            if self.withViewsOps:
                self.query_and_monitor_view_tasks(nodes)
            # Run mutation operations
            if self.withMutationOps:
                self.run_mutation_operations()
            # failed_over.result()
            msg = "rebalance failed while removing failover nodes {0}" \
                  .format(node.id)
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                            msg=msg)
            for task in compact_tasks:
                task.result()
        # msg = "rebalance failed while removing failover nodes {0}".format(node.id)
        # self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

    def load_all_buckets(self, gen, op):
        tasks = []
        for bucket in self.bucket_util.buckets:
            tasks.append(
                self.task.async_load_gen_docs(
                    self.cluster,
                    bucket,
                    gen,
                    op,
                    0,
                    batch_size=20,
                    process_concurrency=1,
                    durability=self.durability_level,
                    timeout_secs=self.sdk_timeout,
                    #durability_timeout=self.durability_timeout
                ))
        for task in tasks:
            self.task.jython_task_manager.get_task_result(task)

    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        self.load_all_buckets(self.gen_initial_create, "create")
        self.bucket_util._wait_for_stats_all_buckets(
            check_ep_items_remaining=True)
        # self.bucket_util._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
        if ("create" in self.doc_ops):
            self.load_all_buckets(self.gen_create, "create")
        if ("update" in self.doc_ops):
            self.load_all_buckets(self.gen_update, "update")
        if ("delete" in self.doc_ops):
            self.load_all_buckets(self.gen_delete, "delete")

    def run_mutation_operations_after_failover(self):
        if ("create" in self.doc_ops):
            self.load_all_buckets(self.afterfailover_gen_create, "create")
        if ("update" in self.doc_ops):
            self.load_all_buckets(self.afterfailover_gen_update, "update")
        if ("delete" in self.doc_ops):
            self.load_all_buckets(self.afterfailover_gen_delete, "delete")

    def define_maps_during_failover(self, recoveryType=[]):
        """ Method to define nope ip, recovery type map """
        recoveryTypeMap = {}
        index = 0
        for server in self.chosen:
            if recoveryType:
                recoveryTypeMap[server.ip] = recoveryType[index]
            index += 1
        return recoveryTypeMap

    def filter_servers(self, original_servers, filter_servers):
        """ Filter servers that have not failed over """
        _servers_ = copy.deepcopy(original_servers)
        for failed in filter_servers:
            for server in _servers_:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
        return _servers_

    def verify_for_recovery_type(self,
                                 chosen=[],
                                 serverMap={},
                                 buckets=[],
                                 recoveryTypeMap={},
                                 fileMap={},
                                 deltaRecoveryBuckets=[]):
        """ Verify recovery type is delta or full """
        summary = ""
        logic = True
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            os_type = shell.extract_remote_info()
            if os_type.type.lower() == 'windows':
                return
            for bucket in buckets:
                path = fileMap[server.ip][bucket.name]
                exists = shell.file_exists(path, "check.txt")
                if deltaRecoveryBuckets is not None:
                    if recoveryTypeMap[server.ip] == "delta" and (
                            bucket.name
                            in deltaRecoveryBuckets) and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(
                            server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "delta" and (
                            bucket.name
                            not in deltaRecoveryBuckets) and exists:
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(
                            server.ip, bucket.name)
                        logic = False
                else:
                    if recoveryTypeMap[server.ip] == "delta" and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(
                            server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "full" and exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(
                            server.ip, bucket.name)
            shell.disconnect()
        self.assertTrue(logic, summary)

    def run_view_creation_operations(self, servers):
        """" Run view Creation and indexing building tasks on servers """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        num_tries = self.input.param("num_tries", 10)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]

        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"

        views = []
        tasks = []
        for bucket in self.buckets:
            temp = self.bucket_util.make_default_views(self.default_view,
                                                       self.default_view_name,
                                                       num_views,
                                                       is_dev_ddoc,
                                                       different_map=False)
            temp_tasks = self.bucket_util.async_create_views(
                self.master, ddoc_name, temp, bucket)
            views += temp
            tasks += temp_tasks

        timeout = max(
            self.wait_timeout * 4,
            len(self.buckets) * self.wait_timeout * self.num_items / 50000)

        for task in tasks:
            task.result(self.wait_timeout * 20)

    def query_and_monitor_view_tasks(self, servers):
        """ Monitor Query Tasks for their completion """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        self.verify_query_task()
        active_tasks = self.cluster_util.async_monitor_active_task(
            servers,
            "indexer",
            "_design/" + prefix + ddoc_name,
            wait_task=False)
        for active_task in active_tasks:
            result = self.task.jython_task_manager.get_task_result(active_task)
            self.assertTrue(result)

    def verify_query_task(self):
        """ Verify Query Results """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = dict()
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"
        expected_rows = None
        timeout = None
        if self.active_resident_threshold == 0:
            timeout = 2400
        if self.max_verify:
            expected_rows = self.max_verify
            query["limit"] = expected_rows
        query["stale"] = "false"
        for bucket in self.buckets:
            self.bucket_util.perform_verify_queries(
                num_views,
                prefix,
                ddoc_name,
                self.default_view_name,
                query,
                bucket=bucket,
                wait_time=timeout,
                expected_rows=expected_rows)

    def create_file(self, chosen, buckets, serverMap):
        """
        Create files in data paths for checking whether delta/full recovery occurred
        """
        fileMap = {}
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            os_type = shell.extract_remote_info().distribution_type
            bucket_paths = dict()
            for bucket in buckets:
                if os_type.lower() == 'windows':
                    self.data_path = 'c:/Program\ Files/Couchbase/Server/var/lib/couchbase/data'
                bucket_data_path = self.data_path + "/" + bucket.name + "/" + "check.txt"
                full_path = self.data_path + "/" + bucket.name + "/"
                bucket_paths[bucket.name] = full_path
                shell.create_file(bucket_data_path, "check")
            fileMap[server.ip] = bucket_paths
            shell.disconnect()
        return fileMap
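create_file and verify_for_recovery_type above implement the delta-vs-full check with a marker file: before the add-back rebalance, check.txt is dropped into each bucket's data directory on the failed-over node; delta recovery reuses the existing data directory so the marker survives, while full recovery rebuilds it so the marker disappears. The expectation table they assert, restated as a standalone helper (a paraphrase for illustration, not framework code):

def recovery_marker_ok(recovery_type, marker_still_exists, bucket_uses_delta=True):
    """True when the observed marker state matches the recovery type.

    delta recovery -> data dir kept, marker should still exist
    full recovery  -> data dir rebuilt, marker should be gone
    bucket_uses_delta mirrors the deltaRecoveryBuckets filter above.
    """
    if recovery_type == "delta" and bucket_uses_delta:
        return marker_still_exists
    return not marker_still_exists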

    def verify_failover_type(self,
                             chosen=None,
                             graceful_count=0,
                             replica_count=0,
                             unreachable=False):
        logic = True
        summary = ""
        nodes = self.rest.node_statuses()
        node_count = len(nodes)
        change_graceful_count = graceful_count
        graceful_failover = True
        if unreachable:
            node_count -= 1
        else:
            change_graceful_count += 1
        if replica_count != 0:
            for node in nodes:
                if unreachable and node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for unreachable node {0} Expected :: Hard, Actual :: Graceful".format(
                            node.ip)
                elif node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if replica_count > graceful_count and (
                            node_count - 1) + graceful_count >= replica_count:
                        if not node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for node {0} Expected :: Graceful, Actual :: Hard".format(
                                node.ip)
                    else:
                        if node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for  {0} Expected :: Hard, Actual :: Graceful".format(
                                node.ip)
        else:
            for node in nodes:
                if node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for node {0} Expected :: Hard, Actual :: Graceful".format(
                            node.ip)
        self.assertTrue(logic, summary)
        return change_graceful_count, graceful_failover
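The assertions above reduce to a single predicate: a reachable node should report gracefulFailoverPossible only while more replicas are configured than graceful failovers already performed, and enough nodes remain to host those replicas; an unreachable node, or a cluster with no replicas, should never offer graceful failover. Restated as a standalone helper for clarity (a paraphrase of the checks above, not framework code):

def graceful_failover_expected(node_count, replica_count, graceful_count, unreachable):
    """Expected value of gracefulFailoverPossible for the chosen node."""
    if unreachable or replica_count == 0:
        return False
    return (replica_count > graceful_count
            and (node_count - 1) + graceful_count >= replica_count)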

    def victim_node_operations(self, node=None):
        if self.stopGracefulFailover:
            self.log.info(" Stopping Graceful Failover ")
            stopped = self.rest.stop_rebalance(wait_timeout=self.wait_timeout /
                                               3)
            self.assertTrue(stopped, msg="unable to stop rebalance")
        if self.killNodes:
            self.log.info(" Killing Memcached ")
            kill_nodes = self.cluster_util.get_victim_nodes(
                self.servers, self.master, node, self.victim_type,
                self.victim_count)
            for kill_node in kill_nodes:
                self.cluster_util.kill_server_memcached(kill_node)
        if self.stopNodes:
            self.log.info(" Stopping Node")
            stop_nodes = self.cluster_util.get_victim_nodes(
                self.servers, self.master, node, self.victim_type,
                self.victim_count)
            for stop_node in stop_nodes:
                self.cluster_util.stop_server(stop_node)
            self.sleep(10)
            for start_node in stop_nodes:
                self.cluster_util.start_server(start_node)
        if self.firewallOnNodes:
            self.log.info(" Enabling Firewall for Node ")
            stop_nodes = self.cluster_util.get_victim_nodes(
                self.servers, self.master, node, self.victim_type,
                self.victim_count)
            for stop_node in stop_nodes:
                self.cluster_util.start_firewall_on_node(stop_node)
            self.sleep(30)
            self.log.info(" Disable Firewall for Node ")
            for start_node in stop_nodes:
                self.cluster_util.stop_firewall_on_node(start_node)
        self.sleep(60)

    def get_server_map(self):
        """ Map of ips and server information """
        server_map = dict()
        for server in self.cluster.servers:
            server_map[server.ip] = server
        return server_map
Example No. 40
0
def create_rest(server_ip=cfg.COUCHBASE_IP, port=cfg.COUCHBASE_PORT,
                username=cfg.COUCHBASE_USER, password=cfg.COUCHBASE_PWD):
    return RestConnection(create_server_obj(server_ip, port, username, password))
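For illustration, a couple of hedged call patterns; the defaults come from the surrounding cfg module, whose values are not shown here, and the explicit address and credentials below are placeholders:

# Use the framework-wide defaults from cfg
rest = create_rest()

# Or point at an arbitrary node explicitly
rest = create_rest("10.1.1.30", 8091, "Administrator", "password")
buckets = rest.get_buckets()  # any RestConnection call can follow (assumed API)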
Example No. 41
0
    def cleanup_cluster(servers, wait_for_rebalance=True, master=None):
        log = logger.Logger.get_logger()
        if master is None:
            master = servers[0]
        rest = RestConnection(master)
        helper = RestHelper(rest)
        helper.is_ns_server_running(
            timeout_in_seconds=testconstants.NS_SERVER_TIMEOUT)
        nodes = rest.node_statuses()
        master_id = rest.get_nodes_self().id
        for node in nodes:
            if int(node.port) in xrange(9091, 9991):
                rest.eject_node(node)
                nodes.remove(node)

        if len(nodes) > 1:
            log.info("rebalancing all nodes in order to remove nodes")
            rest.log_client_error("Starting rebalance from test, ejected nodes %s" % \
                                                             [node.id for node in nodes if node.id != master_id])
            removed = helper.remove_nodes(
                knownNodes=[node.id for node in nodes],
                ejectedNodes=[
                    node.id for node in nodes if node.id != master_id
                ],
                wait_for_rebalance=wait_for_rebalance)
            success_cleaned = []
            alt_addr = TestInputSingleton.input.param("alt_addr", False)
            for removed in [node for node in nodes if (node.id != master_id)]:
                removed.rest_password = servers[0].rest_password
                removed.rest_username = servers[0].rest_username
                try:
                    if alt_addr:
                        for server in servers:
                            shell = RemoteMachineShellConnection(server)
                            internal_IP = shell.get_ip_address()
                            internal_IP = [
                                x for x in internal_IP if x != "127.0.0.1"
                            ]
                            shell.disconnect()
                            if removed.ip in internal_IP:
                                rest = RestConnection(server)
                                break
                    else:
                        rest = RestConnection(removed)
                except Exception as ex:
                    log.error(
                        "can't create rest connection after rebalance out for "
                        "ejected nodes, will retry after 10 seconds according "
                        "to MB-8430: {0}".format(ex))
                    time.sleep(10)
                    rest = RestConnection(removed)
                start = time.time()
                while time.time() - start < 30:
                    if len(rest.get_pools_info()["pools"]) == 0:
                        success_cleaned.append(removed)
                        break
                    else:
                        time.sleep(0.1)
                if time.time() - start > 10:
                    log.error("'pools' on node {0}:{1} - {2}".format(
                        removed.ip, removed.port,
                        rest.get_pools_info()["pools"]))
            for node in set([node for node in nodes if (node.id != master_id)
                             ]) - set(success_cleaned):
                log.error(
                    "node {0}:{1} was not cleaned after removing from cluster".
                    format(node.ip, node.port))
                try:
                    if alt_addr:
                        for server in servers:
                            shell = RemoteMachineShellConnection(server)
                            internal_IP = shell.get_ip_address()
                            internal_IP = [
                                x for x in internal_IP if x != "127.0.0.1"
                            ]
                            shell.disconnect()
                            if node.ip in internal_IP:
                                rest = RestConnection(server)
                                break
                    else:
                        rest = RestConnection(node)
                    if not alt_addr:
                        rest.force_eject_node()
                except Exception as ex:
                    log.error("force_eject_node {0}:{1} failed: {2}".format(
                        removed.ip, removed.port, ex))
            if len(set([node for node in nodes if (node.id != master_id)])\
                    - set(success_cleaned)) != 0:
                if not alt_addr:
                    raise Exception(
                        "not all ejected nodes were cleaned successfully")

            log.info("removed all the nodes from cluster associated with {0} ? {1}".format(servers[0], \
                    [(node.id, node.port) for node in nodes if (node.id != master_id)]))
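The per-node wait inside cleanup_cluster relies on a node that has fully left the cluster answering with an empty "pools" list over REST. The same retry shape, factored out with an injectable get_pools callable (hypothetical, for illustration):

import time

def wait_until_node_ejected(get_pools, timeout=30, poll_interval=0.1):
    """Return True once get_pools() comes back empty, i.e. the node has left the cluster."""
    start = time.time()
    while time.time() - start < timeout:
        if len(get_pools()) == 0:
            return True
        time.sleep(poll_interval)
    return False

Called as wait_until_node_ejected(lambda: rest.get_pools_info()["pools"]), this mirrors the 30-second loop above.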
Example No. 42
0
    def common_test_body(self, failover_reason, rebalance_type=None):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case(before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
            4.1 Rebalance the cluster is failover of K nodeStatuses
            4.2 Run Add-Back operation with recoveryType = (full/delta)
                with rebalance
            5. Verify all expected operations completed by checking
               stats, replicaiton, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.cluster.servers[1]
        else:
            self.master = self.cluster.master
        self.log.info(
            " Picking node {0} as reference node for test case".format(
                self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Variable to decide the durability outcome
        durability_will_fail = False
        # Variable to track the number of nodes failed
        num_nodes_failed = 1

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 \
                and (self.graceful or self.recoveryType is not None):
            self.log.error(
                "Can't apply graceful failover to nodes with version < 3.*")
            self.log.error("Please check configuration params: SKIPPING TEST")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform View Creation Tasks and
        # check for completion if required before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Take snap-shot of data set used for validation
        record_static_data_set = {}
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        if not self.withMutationOps:
            record_static_data_set = self.bucket_util.get_data_set_all(
                self.cluster.servers, self.buckets, path=None)

        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                self.servers, self.buckets)
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.servers, self.buckets)

        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        target_bucket = self.bucket_util.buckets[0]

        # Update new_replica value, if provided in the conf
        if self.new_replica:
            self.num_replicas = self.new_replica
            bucket_helper = BucketHelper(self.master)
            bucket_helper.change_bucket_props(target_bucket.name,
                                              replicaNumber=self.num_replicas)

        # Decide whether the durability is going to fail or not
        if self.num_failed_nodes >= 1 and self.num_replicas > 1:
            durability_will_fail = True

        # Construct target vbucket list from the nodes
        # which are going to be failed over
        vbucket_list = list()
        for target_node in self.chosen:
            for server in self.servers:
                if server.ip == target_node.ip:
                    # Comment out the break once vbucket_list method is fixed
                    break
                    shell_conn = RemoteMachineShellConnection(server)
                    cb_stats = Cbstats(shell_conn)
                    vbuckets = cb_stats.vbucket_list(target_bucket.name,
                                                     self.target_vbucket_type)
                    shell_conn.disconnect()
                    vbucket_list += vbuckets

        # Code to generate doc_loaders that will work on vbucket_type
        # based on targeted nodes. This will perform CRUD only on
        # vbuckets which will be affected by the failover
        self.gen_create = doc_generator(self.key,
                                        self.num_items,
                                        self.num_items * 1.5,
                                        target_vbucket=vbucket_list)
        self.gen_update = doc_generator(self.key,
                                        self.num_items / 2,
                                        self.num_items,
                                        target_vbucket=vbucket_list)
        self.gen_delete = doc_generator(self.key,
                                        self.num_items / 4,
                                        self.num_items / 2 - 1,
                                        target_vbucket=vbucket_list)
        self.afterfailover_gen_create = doc_generator(
            self.key,
            self.num_items * 1.6,
            self.num_items * 2,
            target_vbucket=vbucket_list)
        self.afterfailover_gen_update = doc_generator(
            self.key, 1, self.num_items / 4, target_vbucket=vbucket_list)
        self.afterfailover_gen_delete = doc_generator(
            self.key,
            self.num_items * 0.5,
            self.num_items * 0.75,
            target_vbucket=vbucket_list)

        # Perform Add Back Operation with Rebalance
        # or only Rebalance with verifications
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.failover_onebyone:
                # Reset it back to False
                durability_will_fail = False
                for node_chosen in self.chosen:
                    if num_nodes_failed > 1:
                        durability_will_fail = True

                    if self.add_back_flag:
                        # In add-back case, durability should never fail, since
                        # the num_nodes in the cluster will remain the same
                        self.run_add_back_operation_and_verify(
                            [node_chosen],
                            prev_vbucket_stats,
                            record_static_data_set,
                            prev_failover_stats,
                            rebalance_type=rebalance_type)
                    else:
                        self.run_rebalance_after_failover_and_verify(
                            [node_chosen],
                            prev_vbucket_stats,
                            record_static_data_set,
                            prev_failover_stats,
                            durability_will_fail=durability_will_fail)
                    num_nodes_failed += 1
            else:
                if self.add_back_flag:
                    self.run_add_back_operation_and_verify(
                        self.chosen,
                        prev_vbucket_stats,
                        record_static_data_set,
                        prev_failover_stats,
                        durability_will_fail=durability_will_fail,
                        rebalance_type=rebalance_type)
                else:
                    self.run_rebalance_after_failover_and_verify(
                        self.chosen,
                        prev_vbucket_stats,
                        record_static_data_set,
                        prev_failover_stats,
                        durability_will_fail=durability_will_fail)
        else:
            return

        # Will verify_unacked_bytes only if the durability is not going to fail
        if self.during_ops is None and not durability_will_fail:
            self.bucket_util.verify_unacked_bytes_all_buckets(
                filter_list=self.filter_list)
Ejemplo n.º 43
0
    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case (before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARDFAILOVER/GRACEFUL)
            4.1 Rebalance the cluster after failover of K nodes
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats, replication, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.cluster.servers[1]
        else:
            self.master = self.cluster.master
        self.log.info(
            " Picking node {0} as reference node for test case".format(
                self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or
                                                  self.recoveryType is not None):
            self.log.error(
                "Graceful failover can't be applied to nodes with version less than 3.*")
            self.log.error(
                "Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform View Creation Tasks and check for completion if required before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Take snapshot of data set used for validation
        record_static_data_set = {}
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        if not self.withMutationOps:
            record_static_data_set = self.bucket_util.get_data_set_all(
                self.cluster.servers, self.buckets, path=None)

        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
                self.servers, self.buckets)
            prev_failover_stats = self.bucket_util.get_failovers_logs(
                self.servers, self.buckets)

        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # Perform Add Back Operation with Rebalance or only Rebalance with verifications
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen,
                                                       prev_vbucket_stats,
                                                       record_static_data_set,
                                                       prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats, record_static_data_set,
                    prev_failover_stats)
        else:
            return

        if self.during_ops is None:
            self.bucket_util.verify_unacked_bytes_all_buckets(
                filter_list=self.filter_list, master_node=self.master)
Ejemplo n.º 44
0
 def set_vbuckets(master, vbuckets):
     rest = RestConnection(master)
     command = "rpc:eval_everywhere(ns_config, set, [couchbase_num_vbuckets_default, {0}]).".format(
         vbuckets)
     status, content = rest.diag_eval(command)
     return status, content
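
A minimal usage sketch for the helper above (not part of the original snippet): master is assumed to be a server object for a node that has not created any buckets yet, since couchbase_num_vbuckets_default typically only affects buckets created afterwards.

 # Hypothetical usage of set_vbuckets(); master is an assumed server object
 status, content = set_vbuckets(master, 64)
 if not status:
     raise Exception("failed to set vbucket count: {0}".format(content))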
Ejemplo n.º 45
0
    def test_xdcr(self):
        _rep_type = self.input.param("replication_type",
                                     "capi")  # capi or xmem
        _bixdcr = self.input.param("bidirectional", "false")
        _clusters_dic = self.input.clusters
        _src_nodes = copy.copy(_clusters_dic[0])
        _src_master = _src_nodes[0]
        _dest_nodes = copy.copy(_clusters_dic[1])
        _dest_master = _dest_nodes[0]

        # Build source cluster
        self.init_rebalance_cluster_create_testbucket(_src_master, _src_nodes)
        # Build destination cluster
        self.init_rebalance_cluster_create_testbucket(_dest_master,
                                                      _dest_nodes)

        # Setting up XDCR
        self.setup_xdcr_start_replication(_src_master, _dest_master, _rep_type,
                                          _bixdcr)

        shell1 = RemoteMachineShellConnection(_src_master)
        shell2 = RemoteMachineShellConnection(_dest_master)
        src_item_count = 0
        dest_item_count = 0
        if self._os == "centos" or self._os == "ubuntu":
            self.log.info("Load {0} through cbworkloadgen at src..".format(
                self.num_items))
            _1 = "cd /home/{0}/opt/couchbase &&".format(
                _src_master.ssh_username)
            _2 = " ./bin/cbworkloadgen -n localhost:8091 --prefix=s_"
            _3 = " -r .8 -i {0} -s 256 -b testbucket -t 1".format(
                self.num_items)
            _4 = " -u {0} -p {1}".format(_src_master.rest_username,
                                         _src_master.rest_password)
            command_to_load = _1 + _2 + _3 + _4
            o, e = shell1.execute_non_sudo_command(command_to_load)
            shell1.log_command_output(o, e)
            time.sleep(20)
            rest = RestConnection(_src_master)
            src_item_count = rest.fetch_bucket_stats(
                bucket="testbucket")["op"]["samples"]["curr_items"][-1]
            if _bixdcr.lower() == "true":
                self.log.info("Load {0} through cbworkloadgen at dest..".format(
                    self.num_items))
                _1 = "cd /home/{0}/opt/couchbase &&".format(
                    _dest_master.ssh_username)
                _2 = " ./bin/cbworkloadgen -n localhost:8091 --prefix=d_"
                _3 = " -r .8 -i {0} -s 256 -b testbucket -t 1".format(
                    self.num_items)
                _4 = " -u {0} -p {1}".format(_dest_master.rest_username,
                                             _dest_master.rest_password)
                command_to_load = _1 + _2 + _3 + _4
                o, e = shell2.execute_non_sudo_command(command_to_load)
                shell2.log_command_output(o, e)
                time.sleep(20)
                rest = RestConnection(_dest_master)
                dest_item_count = rest.fetch_bucket_stats(
                    bucket="testbucket")["op"]["samples"]["curr_items"][-1]
            self.wait_for_replication_to_catchup(_src_master, _dest_master,
                                                 1200, "destination")
            if _bixdcr.lower() == "true":
                self.wait_for_replication_to_catchup(_dest_master, _src_master,
                                                     1200, "source")
            self.log.info("XDC REPLICATION caught up")
            rest1 = RestConnection(_src_master)
            rest2 = RestConnection(_dest_master)
            curr_count_on_src = rest1.fetch_bucket_stats(
                bucket="testbucket")["op"]["samples"]["curr_items"][-1]
            curr_count_on_dest = rest2.fetch_bucket_stats(
                bucket="testbucket")["op"]["samples"]["curr_items"][-1]
            assert curr_count_on_src == (
                src_item_count +
                dest_item_count), "ItemCount on source not what's expected"
            assert curr_count_on_dest == (
                src_item_count + dest_item_count
            ), "ItemCount on destination not what's expected"
        elif self._os == "windows":
            # TODO: Windows support
            self.log.info("Yet to add support for windows!")
            pass
        shell1.disconnect()
        shell2.disconnect()
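
The test above assembles the cbworkloadgen command from four string fragments, once per cluster. A hypothetical helper (not part of the original test) that builds the same command in one place could look like the sketch below; the flags simply mirror the ones used in test_xdcr, and reusing them elsewhere is an assumption.

def build_cbworkloadgen_cmd(server, num_items, prefix):
    # Assembles the same "cd ... && ./bin/cbworkloadgen ..." command used above.
    # All flag values are copied from the snippet above.
    return ("cd /home/{0}/opt/couchbase && "
            "./bin/cbworkloadgen -n localhost:8091 --prefix={1} "
            "-r .8 -i {2} -s 256 -b testbucket -t 1 "
            "-u {3} -p {4}").format(server.ssh_username, prefix, num_items,
                                    server.rest_username, server.rest_password)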
Ejemplo n.º 46
0
 count = 0
 prefix = str(uuid.uuid4())[:6]
 count = 10 * 1000 * 1000 * 1000
 size = 512
 expiry = 0
 if "count" in params:
     count = int(params["count"])
     print count
 if "size" in params:
     size = int(params["size"])
 if "prefix" in params:
     prefix = params["prefix"]
 #if "expiry" in params:
 #    expiry = int(params["expiry"])
 payload = MemcachedClientHelper.create_value('*', size)
 rest = RestConnection(server)
 buckets = rest.get_buckets()
 for bucket in buckets:
     smart = VBucketAwareMemcached(rest, bucket.name)
     mc = MemcachedClientHelper.proxy_client(server, bucket.name)
     i = 0
     while i < count:
         try:
             key = "{0}-{1}".format(prefix, i)
             #mc = smart.memcached(key)
             # expiry is disabled in this snippet; every key is stored with TTL 0
             mc.set(key, 0, 0, payload)
             #mc.get(key)
             i += 1
         except Exception as ex:
             #print ex
             pass
Ejemplo n.º 47
0
 def _get_cluster_ca_cert(self):
     rest = RestConnection(self.host)
     api = rest.baseUrl + "pools/default/certificate?extended=true"
     status, content, header = rest._http_request(api, 'GET')
     return status, content, header
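
A minimal caller sketch for the helper above (hypothetical, not from the original suite): it only checks the returned HTTP status and logs the raw body, since the exact response layout is not shown in the snippet.

 # Hypothetical usage of _get_cluster_ca_cert(); only the status flag is relied on
 status, content, header = self._get_cluster_ca_cert()
 if status:
     self.log.info("cluster CA certificate response: {0}".format(content))
 else:
     self.fail("could not fetch cluster CA certificate")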
Ejemplo n.º 48
0
    def common_test_body(self, keys_count, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)

        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            #let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                #wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    #verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for i in rest.get_logs(): self.log.error(i)
                    api = rest.baseUrl + 'nodeStatuses'
                    status, content, header = rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in  60 seconds..")
                #try again in 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("60 seconds sleep after failover before invoking rebalance...")
            time.sleep(60)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                               ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                        _servers_.remove(server)
                        self._cleanup_nodes.append(server)

        log.info("Begin VERIFICATION ...")
        RebalanceHelper.wait_for_replication(_servers_, self.cluster)
        self.verify_cluster_stats(_servers_, self.master)
Ejemplo n.º 49
0
    def test_ingestion_after_kv_rollback_cbas_disconnected(self):
        self.setup_for_test()

        # Stop persistence on Node A
        self.log.info("Stopping persistence on NodeA")
        mem_client = MemcachedClientHelper.direct_client(
            self.cluster.master, self.cb_bucket_name)
        mem_client.stop_persistence()

        # Perform Create, Update, Delete ops in the CB bucket
        self.log.info("Performing Mutations")
        self.perform_doc_ops_in_all_cb_buckets("delete", 0, self.num_items / 2)

        # Count no. of items in CB & CBAS Buckets
        kv_nodes = self.get_kv_nodes(self.servers, self.cluster.master)
        items_in_cb_bucket = 0
        for node in kv_nodes:
            items_in_cb_bucket += self.get_item_count(node,
                                                      self.cb_bucket_name)

        items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)
        items_before_rollback = items_in_cbas_bucket
        self.log.info(
            "Before Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
            items_in_cb_bucket, items_in_cbas_bucket)

        self.assertTrue(
            items_in_cb_bucket == items_in_cbas_bucket,
            "Before Rollback : # Items in CBAS bucket does not match that in the CB bucket"
        )

        self.cbas_util.disconnect_from_bucket(self.cbas_bucket_name)
        # Kill memcached on Node A so that Node B becomes master
        self.log.info("Kill Memcached process on NodeA")
        shell = RemoteMachineShellConnection(self.cluster.master)
        shell.kill_memcached()
        #         self.sleep(10,"Wait for 10 secs for memcached restarts.")
        if self.input.param('kill_cbas', False):
            shell = RemoteMachineShellConnection(self.cbas_node)
            shell.kill_process("/opt/couchbase/lib/cbas/runtime/bin/java",
                               "java")
            shell.kill_process("/opt/couchbase/bin/cbas", "cbas")

        tries = 60
        result = False
        while tries > 0 and not result:
            try:
                result = self.cbas_util.connect_to_bucket(
                    self.cbas_bucket_name)
            except:
                pass
            tries -= 1
            self.sleep(2)
        self.assertTrue(
            result,
            "CBAS connect bucket failed after memcached killed on KV node.")
        curr = time.time()
        while items_in_cbas_bucket != 0 and items_in_cbas_bucket <= items_before_rollback:
            items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
                self.cbas_dataset_name)
            if curr + 120 < time.time():
                break
        self.assertTrue(items_in_cbas_bucket > items_before_rollback,
                        "Roll-back did not happen.")
        self.log.info("#######BINGO########\nROLLBACK HAPPENED")

        curr = time.time()
        while items_in_cb_bucket != items_in_cbas_bucket:
            items_in_cb_bucket = 0
            items_in_cbas_bucket = 0
            if self.where_field and self.where_value:
                try:
                    items_in_cb_bucket = RestConnection(
                        self.cluster.master).query_tool(
                            'select count(*) from %s where %s = "%s"' %
                            (self.cb_bucket_name, self.where_field,
                             self.where_value))['results'][0]['$1']
                except:
                    self.log.info(
                        "Indexer in rollback state. Query failed. Pass and move ahead."
                    )
                    pass
            else:
                for node in kv_nodes:
                    items_in_cb_bucket += self.get_item_count(
                        node, self.cb_bucket_name)

            self.log.info("Items in CB bucket after rollback: %s" %
                          items_in_cb_bucket)
            items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
                self.cbas_dataset_name)
            if curr + 120 < time.time():
                break

        # Count no. of items in CB & CBAS Buckets
        items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)

        self.log.info(
            "After Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
            items_in_cb_bucket, items_in_cbas_bucket)

        self.assertTrue(
            items_in_cb_bucket == items_in_cbas_bucket,
            "After Rollback : # Items in CBAS bucket does not match that in the CB bucket"
        )
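
The rollback tests in this listing (this one and the later Ejemplos) repeat the same pattern: poll a counter every couple of seconds until a condition holds or roughly 120 seconds have passed. A generic sketch of that pattern is shown below; it is an assumption for illustration, not a helper from the original suite.

import time

def poll_until(fetch, condition, timeout=120, interval=2):
    # Calls fetch() until condition(value) is true or the timeout expires,
    # then returns the last fetched value so the caller can assert on it.
    deadline = time.time() + timeout
    value = fetch()
    while not condition(value) and time.time() < deadline:
        time.sleep(interval)
        value = fetch()
    return value

For example, the wait for the CBAS item count to climb back above items_before_rollback could be written as poll_until(lambda: self.cbas_util.get_num_items_in_cbas_dataset(self.cbas_dataset_name)[0], lambda n: n > items_before_rollback).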
Ejemplo n.º 50
0
 def test_bucket_backup_restore(self):
     shell = RemoteMachineShellConnection(self.master)
     self.init_rebalance_cluster_create_testbucket(self.master,
                                                   self.servers)
     if self._os == "centos" or self._os == "ubuntu":
         self.log.info("Load {0} through cbworkloadgen ..".format(
             self.num_items))
         _1 = "cd /home/{0}/opt/couchbase &&".format(
             self.master.ssh_username)
         _2 = " ./bin/cbworkloadgen -n localhost:8091"
         _3 = " -r .8 -i {0} -s 256 -b testbucket -t 1".format(
             self.num_items)
         _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                      self.master.rest_password)
         command_to_load = _1 + _2 + _3 + _4
         o, e = shell.execute_non_sudo_command(command_to_load)
         shell.log_command_output(o, e)
         time.sleep(20)
         rest = RestConnection(self.master)
         ini_item_count = rest.fetch_bucket_stats(
             bucket="testbucket")["op"]["samples"]["curr_items"][-1]
         self.log.info("Backing up bucket 'testbucket' ..")
         _1 = "cd /home/{0}/opt/couchbase &&".format(
             self.master.ssh_username)
         _2 = " ./bin/cbbackup http://localhost:8091"
         _3 = " /home/{0}/backup".format(self.master.ssh_username)
         _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                      self.master.rest_password)
         command_to_backup = _1 + _2 + _3 + _4
         o, e = shell.execute_non_sudo_command(command_to_backup)
         shell.log_command_output(o, e)
         time.sleep(10)
         self.log.info("Deleting bucket ..")
         _1 = "cd /home/{0}/opt/couchbase &&".format(
             self.master.ssh_username)
         _2 = " ./bin/couchbase-cli bucket-delete -c localhost:8091"
         _3 = " --bucket=testbucket"
         _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                      self.master.rest_password)
         command_to_delete_bucket = _1 + _2 + _3 + _4
         o, e = shell.execute_non_sudo_command(command_to_delete_bucket)
         shell.log_command_output(o, e)
         time.sleep(20)
         if len(self.servers) < 2:
             rep_count = 0
         else:
             rep_count = 1
         self.log.info("Recreating bucket ..")
         _1 = "cd /home/{0}/opt/couchbase &&".format(
             self.master.ssh_username)
         _2 = " ./bin/couchbase-cli bucket-create -c localhost:8091"
         _3 = " --bucket=testbucket --bucket-type=couchbase --bucket-port=11211"
         _4 = " --bucket-ramsize=500 --bucket-replica={0} --wait".format(
             rep_count)
         _5 = " -u {0} -p {1}".format(self.master.rest_username,
                                      self.master.rest_password)
         command_to_create_bucket = _1 + _2 + _3 + _4 + _5
         o, e = shell.execute_non_sudo_command(command_to_create_bucket)
         shell.log_command_output(o, e)
         time.sleep(20)
         self.log.info("Restoring bucket 'testbucket' ..")
         _1 = "cd /home/{0}/opt/couchbase &&".format(
             self.master.ssh_username)
         _2 = " ./bin/cbrestore /home/{0}/backup http://localhost:8091".format(
             self.master.ssh_username)
         _3 = " -b testbucket -B testbucket"
         _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                      self.master.rest_password)
         command_to_restore = _1 + _2 + _3 + _4
         o, e = shell.execute_non_sudo_command(command_to_restore)
         shell.log_command_output(o, e)
         time.sleep(10)
         rest = RestConnection(self.master)
         fin_item_count = rest.fetch_bucket_stats(
             bucket="testbucket")["op"]["samples"]["curr_items"][-1]
         self.log.info("Removing backed-up folder ..")
         command_to_remove_folder = "rm -rf /home/{0}/backup".format(
             self.master.ssh_username)
         o, e = shell.execute_non_sudo_command(command_to_remove_folder)
         shell.log_command_output(o, e)
         if fin_item_count == ini_item_count:
             self.log.info(
                 "Item count before delete and after restore matched, {0}={1}"
                 .format(fin_item_count, ini_item_count))
         else:
             self.fail(
                 "Item count didn't match after backup/restore, {0}!={1}".format(
                     fin_item_count, ini_item_count))
         self.log.info("Deleting testbucket ..")
         _1 = "cd /home/{0}/opt/couchbase &&".format(
             self.master.ssh_username)
         _2 = " ./bin/couchbase-cli bucket-delete -c localhost:8091"
         _3 = " --bucket=testbucket"
         _4 = " -u {0} -p {1}".format(self.master.rest_username,
                                      self.master.rest_password)
         command_to_delete_bucket = _1 + _2 + _3 + _4
         o, e = shell.execute_non_sudo_command(command_to_delete_bucket)
         shell.log_command_output(o, e)
         time.sleep(10)
     elif self._os == "windows":
         # TODO: Windows support
         self.log.info("Yet to add support for windows!")
         pass
     shell.disconnect()
Ejemplo n.º 51
0
    def _cluster_setup(self):
        replicas = self.input.param("replicas", 1)
        keys_count = self.input.param("keys-count", 0)
        num_buckets = self.input.param("num-buckets", 1)

        bucket_name = "default"
        master = self.servers[0]
        credentials = self.input.membase_settings
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        rest.init_cluster(username=self.master.rest_username,
                          password=self.master.rest_password)
        rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved)
        rest.reset_autofailover()
        ClusterOperationHelper.add_and_rebalance(self.servers, True)

        if num_buckets == 1:
            bucket_ram = info.memoryQuota * 2 / 3
            rest.create_bucket(bucket=bucket_name,
                               ramQuotaMB=bucket_ram,
                               replicaNumber=replicas,
                               proxyPort=info.moxi)
        else:
            created = BucketOperationHelper.create_multiple_buckets(
                self.master, replicas, howmany=num_buckets)
            self.assertTrue(created, "unable to create multiple buckets")

        buckets = rest.get_buckets()
        for bucket in buckets:
            ready = BucketOperationHelper.wait_for_memcached(
                self.master, bucket.name)
            self.assertTrue(ready, msg="wait_for_memcached failed")

        for bucket in buckets:
            inserted_keys_cnt = self.load_data(self.master, bucket.name,
                                               keys_count)
            log.info('inserted {0} keys'.format(inserted_keys_cnt))
Ejemplo n.º 52
0
    def test_rebalance_kv_rollback_create_ops(self):
        self.setup_for_test()
        items_before_persistence_stop = self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)[0]
        self.log.info("Items in CBAS before persistence stop: %s" %
                      items_before_persistence_stop)
        # Stop persistence on Node A
        self.log.info("Stopping persistence on NodeA")
        mem_client = MemcachedClientHelper.direct_client(
            self.cluster.master, self.cb_bucket_name)
        mem_client.stop_persistence()

        # Perform Create, Update, Delete ops in the CB bucket
        self.log.info("Performing Mutations")
        self.perform_doc_ops_in_all_cb_buckets("create", self.num_items,
                                               self.num_items * 3 / 2)

        kv_nodes = self.get_kv_nodes(self.servers, self.cluster.master)
        items_in_cb_bucket = 0
        if self.where_field and self.where_value:
            items_in_cb_bucket = RestConnection(
                self.cluster.master).query_tool(
                    'select count(*) from %s where %s = "%s"' %
                    (self.cb_bucket_name, self.where_field,
                     self.where_value))['results'][0]['$1']
        else:
            for node in kv_nodes:
                items_in_cb_bucket += self.get_item_count(
                    node, self.cb_bucket_name)
        # Validate no. of items in CBAS dataset
        self.assertTrue(
            self.cbas_util.validate_cbas_dataset_items_count(
                self.cbas_dataset_name, items_in_cb_bucket, 0),
            "No. of items in CBAS dataset do not match that in the CB bucket")

        # Count no. of items in CB & CBAS Buckets
        items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)

        self.log.info(
            "Before Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
            items_in_cb_bucket, items_in_cbas_bucket)

        self.assertTrue(
            items_in_cb_bucket == items_in_cbas_bucket,
            "Before Rollback : # Items in CBAS bucket does not match that in the CB bucket"
        )

        if self.CC:
            self.cluster_util.remove_node([self.otpNodes[0]],
                                          wait_for_rebalance=False)
            self.cbas_util.closeConn()
            self.cbas_util = CbasUtil(self.cluster.master,
                                      self.cluster.cbas_nodes[0])
            self.cbas_util.createConn("default")
        else:
            self.cluster_util.remove_node([self.otpNodes[1]],
                                          wait_for_rebalance=False)

        # Kill memcached on Node A so that Node B becomes master
        self.log.info("Kill Memcached process on NodeA")
        shell = RemoteMachineShellConnection(self.cluster.master)
        shell.kill_memcached()
        self.sleep(2, "Wait for 2 secs for DCP rollback sent to CBAS.")
        curr = time.time()
        while items_in_cbas_bucket == -1 or (
                items_in_cbas_bucket != 0
                and items_in_cbas_bucket > items_before_persistence_stop):
            try:
                if curr + 120 < time.time():
                    break
                items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
                self.log.info("Items in CBAS: %s" % items_in_cbas_bucket)
            except:
                self.log.info(
                    "Rebalance is probably in progress; queries may fail until it completes."
                )
                pass
        self.assertTrue(items_in_cbas_bucket <= items_before_persistence_stop,
                        "Roll-back did not happen.")
        self.log.info("#######BINGO########\nROLLBACK HAPPENED")

        items_in_cb_bucket = 0
        curr = time.time()
        while items_in_cb_bucket != items_in_cbas_bucket or items_in_cb_bucket == 0:
            items_in_cb_bucket = 0
            items_in_cbas_bucket = 0
            if self.where_field and self.where_value:
                try:
                    items_in_cb_bucket = RestConnection(
                        self.cluster.master).query_tool(
                            'select count(*) from %s where %s = "%s"' %
                            (self.cb_bucket_name, self.where_field,
                             self.where_value))['results'][0]['$1']
                except:
                    self.log.info(
                        "Indexer in rollback state. Query failed. Pass and move ahead."
                    )
                    pass
            else:
                for node in kv_nodes:
                    items_in_cb_bucket += self.get_item_count(
                        node, self.cb_bucket_name)

            self.log.info("Items in CB bucket after rollback: %s" %
                          items_in_cb_bucket)
            try:
                items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
                    self.cbas_dataset_name)
            except:
                pass
            if curr + 120 < time.time():
                break
        str_time = time.time()
        while self.rest._rebalance_progress_status(
        ) == "running" and time.time() < str_time + 300:
            self.sleep(1)
            self.log.info("Waiting for rebalance to complete")

        self.log.info(
            "After Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
            items_in_cb_bucket, items_in_cbas_bucket)

        self.assertTrue(
            items_in_cb_bucket == items_in_cbas_bucket,
            "After Rollback : # Items in CBAS bucket does not match that in the CB bucket"
        )
Ejemplo n.º 53
0
    def _cluster_setup(self):
        keys_count = self.input.param("keys-count", 0)
        num_buckets = self.input.param("num-buckets", 1)
        bucketType = self.input.param("bucketType", "ephemeral")
        evictionPolicy = self.input.param("evictionPolicy", "noEviction")  # fullEviction

        # master = self.servers[0]
        # credentials = self.input.membase_settings
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        rest.init_cluster(username=self.master.rest_username,
                          password=self.master.rest_password)
        memory = min(info.mcdMemoryReserved, self.input.param("kv_memory", 1000))
        rest.init_cluster_memoryQuota(memoryQuota=memory)
        rest.reset_autoreprovision()
        self._add_and_rebalance(self.servers, True)

        if num_buckets == 1:
            bucket_name = "default"
            bucket_ram = info.memoryQuota * 2 // 3
            rest.create_bucket(bucket=bucket_name,
                               ramQuotaMB=bucket_ram,
                               replicaNumber=self.replicas,
                               proxyPort=info.moxi,
                               bucketType=bucketType,
                               evictionPolicy=evictionPolicy)
        else:
            created = BucketOperationHelper.create_multiple_buckets(
                self.master, self.replicas, howmany=num_buckets,
                bucketType=bucketType, evictionPolicy=evictionPolicy)
            self.assertTrue(created, "unable to create multiple buckets")

        buckets = rest.get_buckets()
        for bucket in buckets:
            ready = BucketOperationHelper.wait_for_memcached(self.master, bucket.name)
            self.assertTrue(ready, msg="wait_for_memcached failed")

        for bucket in buckets:
            distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05}
            inserted_count, rejected_count = self.load_bucket_and_return_the_keys(servers=[self.master],
                                                                                name=bucket.name,
                                                                                # ram_load_ratio=0.02,
                                                                                value_size_distribution=distribution,
                                                                                write_only=True,
                                                                                moxi=True,
                                                                                number_of_threads=2,
                                                                                number_of_items=keys_count)
            self.loaded_items[bucket.name] = inserted_count
Ejemplo n.º 54
0
    def test_ingestion_after_kv_rollback_delete_ops(self):
        self.setup_for_test()
        # Stop persistence on Node A
        self.log.info("Stopping persistence on NodeA")
        mem_client = MemcachedClientHelper.direct_client(
            self.cluster.master, self.cb_bucket_name)
        mem_client.stop_persistence()

        # Perform Create, Update, Delete ops in the CB bucket
        self.log.info("Performing Mutations")
        self.perform_doc_ops_in_all_cb_buckets("delete", 0, self.num_items / 2)

        kv_nodes = self.get_kv_nodes(self.servers, self.cluster.master)
        items_in_cb_bucket = 0
        if self.where_field and self.where_value:
            items_in_cb_bucket = RestConnection(
                self.cluster.master).query_tool(
                    'select count(*) from %s where %s = "%s"' %
                    (self.cb_bucket_name, self.where_field,
                     self.where_value))['results'][0]['$1']
        else:
            for node in kv_nodes:
                items_in_cb_bucket += self.get_item_count(
                    node, self.cb_bucket_name)
        # Validate no. of items in CBAS dataset
        self.assertTrue(
            self.cbas_util.validate_cbas_dataset_items_count(
                self.cbas_dataset_name, items_in_cb_bucket, 0),
            "No. of items in CBAS dataset do not match that in the CB bucket")

        # Count no. of items in CB & CBAS Buckets
        items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
            self.cbas_dataset_name)
        items_before_rollback = items_in_cbas_bucket
        self.log.info(
            "Before Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
            items_in_cb_bucket, items_in_cbas_bucket)

        self.assertTrue(
            items_in_cb_bucket == items_in_cbas_bucket,
            "Before Rollback : # Items in CBAS bucket does not match that in the CB bucket"
        )

        # Kill memcached on Node A so that Node B becomes master
        self.log.info("Kill Memcached process on NodeA")
        shell = RemoteMachineShellConnection(self.cluster.master)
        shell.kill_memcached()
        self.sleep(2, "Wait for 2 secs for DCP rollback sent to CBAS.")
        curr = time.time()
        while items_in_cbas_bucket != 0 and items_in_cbas_bucket <= items_before_rollback:
            items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
                self.cbas_dataset_name)
            if curr + 120 < time.time():
                break
        self.assertTrue(items_in_cbas_bucket > items_before_rollback,
                        "Roll-back did not happen.")
        self.log.info("#######BINGO########\nROLLBACK HAPPENED")

        items_in_cb_bucket = 0
        curr = time.time()
        while items_in_cb_bucket != items_in_cbas_bucket:
            items_in_cb_bucket = 0
            items_in_cbas_bucket = 0
            if self.where_field and self.where_value:
                try:
                    items_in_cb_bucket = RestConnection(
                        self.cluster.master).query_tool(
                            'select count(*) from %s where %s = "%s"' %
                            (self.cb_bucket_name, self.where_field,
                             self.where_value))['results'][0]['$1']
                except:
                    self.log.info(
                        "Indexer in rollback state. Query failed. Pass and move ahead."
                    )
                    pass
            else:
                for node in kv_nodes:
                    items_in_cb_bucket += self.get_item_count(
                        node, self.cb_bucket_name)

            self.log.info("Items in CB bucket after rollback: %s" %
                          items_in_cb_bucket)
            items_in_cbas_bucket, _ = self.cbas_util.get_num_items_in_cbas_dataset(
                self.cbas_dataset_name)
            if curr + 120 < time.time():
                break

        self.log.info(
            "After Rollback --- # docs in CB bucket : %s, # docs in CBAS bucket : %s",
            items_in_cb_bucket, items_in_cbas_bucket)

        self.assertTrue(
            items_in_cb_bucket == items_in_cbas_bucket,
            "After Rollback : # Items in CBAS bucket does not match that in the CB bucket"
        )
Ejemplo n.º 55
0
class AutoReprovisionTests(unittest.TestCase):
    def setUp(self):
        self.input = TestInputSingleton.input
        self.case_number = self.input.param("case_number", 0)
        self.use_master = self.input.param("use_master", False)
        self.skip_services = self.input.param("skip_services", True)
        self.replicas = self.input.param("replicas", 1)
        self.servers = self.input.servers
        self.log = logger.Logger().get_logger()
        self.master = self.servers[0]
        self.rest = RestConnection(self.master)
        self.timeout = 60
        self.loaded_items = dict()
        AutoReprovisionBaseTest.common_setup(self.input, self)
        self._cluster_setup()
        if self.use_master:
            self.server_fail = self.servers[0]
            self.master = self.servers[1]
        else:
            self.server_fail = self.servers[1]
        for server in self.servers:
            remote = RemoteMachineShellConnection(server)
            output, error = remote.enable_diag_eval_on_non_local_hosts()

    def tearDown(self):
        AutoReprovisionBaseTest.common_tearDown(self.servers, self)

    def sleep(self, timeout=1, message=""):
        self.log.info("sleep for {0} secs. {1} ...".format(timeout, message))
        time.sleep(timeout)

    def test_default_values(self):
        # read settings and verify
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 1)
        self.assertEqual(settings.count, 0)

    def test_enable(self):
        status = self.rest.update_autoreprovision_settings(True, 2)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        # read settings and verify
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 0)

    def test_disable(self):
        status = self.rest.update_autoreprovision_settings(False, 2)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        # read settings and verify
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, False)
        self.assertEqual(settings.max_nodes, 1)
        self.assertEqual(settings.count, 0)

    def test_valid_max_nodes(self):
        max_nodes = [1, 2, 5, 10, 100]  # 0?
        for max_node in max_nodes:
            status = self.rest.update_autoreprovision_settings(True, max_node)
            if not status:
                self.fail('failed to change autoreprovision_settings!')
            # read settings and verify
            settings = self.rest.get_autoreprovision_settings()
            self.assertEqual(settings.max_nodes, max_node)

    def test_node_firewall_enabled(self):
        timeout = self.timeout // 2

        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        RemoteUtilHelper.enable_firewall(self.server_fail)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        self.sleep(5)
        shell = RemoteMachineShellConnection(self.server_fail)
        shell.disable_firewall()
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
        buckets = self.rest.get_buckets()
        for bucket in buckets:
            self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

    def test_node_stop(self):
        timeout = self.timeout // 2
        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        self._stop_couchbase(self.server_fail)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        self.sleep(30)
        self._start_couchbase(self.server_fail)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        self.sleep(10)
        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is unexpectedly balanced")
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
        buckets = self.rest.get_buckets()
        for bucket in buckets:
            self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

    def test_node_cb_restart(self):
        timeout = self.timeout // 2
        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        shell = RemoteMachineShellConnection(self.server_fail)
        shell.restart_couchbase()
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        self.sleep(5)
        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is unexpectedly balanced")
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
        buckets = self.rest.get_buckets()
        for bucket in buckets:
            self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

    def test_node_reboot(self):
        wait_timeout = 120
        timeout = self.timeout // 2
        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        shell = RemoteMachineShellConnection(self.server_fail)
        if shell.extract_remote_info().type.lower() == 'windows':
            o, r = shell.execute_command("shutdown -r -f -t 0")
        elif shell.extract_remote_info().type.lower() == 'linux':
            o, r = shell.execute_command("reboot")
        shell.log_command_output(o, r)
        time.sleep(wait_timeout * 5)
        # disable firewall on the node
        shell = RemoteMachineShellConnection(self.server_fail)
        shell.disable_firewall()
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
        buckets = self.rest.get_buckets()
        for bucket in buckets:
            self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

    def test_firewall_node_when_autoreprovisioning(self):
        wait_timeout = 120
        before = self.input.param("before", True)
        timeout = self.timeout // 2
        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        shell = RemoteMachineShellConnection(self.server_fail)
        if shell.extract_remote_info().type.lower() == 'windows':
            o, r = shell.execute_command("shutdown -r -f -t 0")
        elif shell.extract_remote_info().type.lower() == 'linux':
            o, r = shell.execute_command("reboot")
        shell.log_command_output(o, r)
        if shell.extract_remote_info().type.lower() == 'windows':
            time.sleep(wait_timeout * 5)
        else:
            time.sleep(wait_timeout)
        # disable firewall on the node
        shell = RemoteMachineShellConnection(self.server_fail)
        shell.disable_firewall()
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
        # self.sleep(5)
        if before:
            RemoteUtilHelper.enable_firewall(self.servers[2])
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        if not before:
            RemoteUtilHelper.enable_firewall(self.servers[2])
        # self.sleep(5)
        try:
            self.rest.monitorRebalance()
            self.fail("Rebalance failed expected")
        except RebalanceFailedException:
            self.log.info("Rebalance failed but it's expected")
        shell = RemoteMachineShellConnection(self.servers[2])
        shell.disable_firewall()
        self.sleep(5)
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
        buckets = self.rest.get_buckets()
        for bucket in buckets:
            self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

    def test_node_memcached_failure(self):
        timeout = self.timeout // 2
        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        self._pause_couchbase(self.server_fail)
        self.sleep(5)
        AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1,
                                                          timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                          self)
        RemoteUtilHelper.common_basic_setup([self.server_fail])
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
        buckets = self.rest.get_buckets()
        for bucket in buckets:
            self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

    def test_two_failed_nodes(self):
        timeout = self.timeout // 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)

        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 2,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        helper = RestHelper(self.rest)
        self.assertFalse(helper.is_cluster_healthy(), "cluster status is healthy")
        self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
        self._start_couchbase(server_fail1)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        self._start_couchbase(server_fail2)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        self.sleep(20)
        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())

    def test_reset_count(self):
        timeout = self.timeout // 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autoreprovision_settings(True, 2)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)

        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')

        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 2,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)

        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 0)

        self._start_couchbase(server_fail2)
        self._start_couchbase(server_fail1)
        self.sleep(30)
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 2)
        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 0)

        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())

    def test_invalid_max_nodes(self):
        max_nodes = [-30, -2, 0.5, 'df', True]  # 3000000000 ?
        for max_node in max_nodes:
            status = self.rest.update_autoreprovision_settings(True, max_node)
            if status:
                self.fail('autoreprovision_settings have been changed incorrectly!')
            # read settings and verify
            settings = self.rest.get_autoreprovision_settings()
            self.assertEqual(settings.max_nodes, 1)

    def test_node_memcached_failure_in_series(self):
        timeout = self.timeout // 2
        status = self.rest.update_autoreprovision_settings(True, 1)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        data_lost = False
        for i in reversed(range(len(self.servers))):
            print(self.servers[i])
            operation = random.choice(['stop', 'memcached_failure', 'restart', 'failover', 'reboot'])
            shell = RemoteMachineShellConnection(self.servers[i])
            print("operation", operation)
            if i == 0:
                self.master = self.servers[1]
            if operation == 'stop':
                self._stop_couchbase(self.servers[i])
            elif operation == 'memcached_failure':
                self._pause_couchbase(self.servers[i])
            elif operation == 'restart':
                shell.restart_couchbase()
            elif operation == 'failover':
                RemoteUtilHelper.enable_firewall(self.servers[i])
            elif operation == 'reboot':
                if shell.extract_remote_info().type.lower() == 'windows':
                    o, r = shell.execute_command("shutdown -r -f -t 0")
                    self.sleep(200)
                elif shell.extract_remote_info().type.lower() == 'linux':
                    o, r = shell.execute_command("reboot")
                shell.log_command_output(o, r)
                self.sleep(60)
            self.sleep(40)
            if operation == 'memcached_failure':
                AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1,
                                                                  timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                                  self)
            if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot':
                AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                                    timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                                    self)
            if operation != 'restart':
                RemoteUtilHelper.common_basic_setup([self.servers[i]])
            AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                                timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                                self)
            helper = RestHelper(RestConnection(self.master))
            self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
            self.sleep(40)
            if operation == 'memcached_failure' or operation == 'failover':
                self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
            else:
                if 'kv' in self.servers[i].services and self.replicas > 0:
                    self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
                    self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
                    self.assertTrue(self.rest.monitorRebalance())
                else:
                    self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
            buckets = self.rest.get_buckets()
            if self.replicas == 0 and (operation == 'restart' or operation == 'reboot'):
                data_lost = True
            for bucket in buckets:
                if not data_lost:
                    self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])

    def test_ui_logs(self):
        timeout = self.timeout // 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autoreprovision_settings(True, 2)
        if not status:
            self.fail('failed to change autoreprovision_settings!')
        self.sleep(5)
        logs = self.rest.get_logs(5)
        self.assertTrue('Enabled auto-reprovision config with max_nodes set to 2' in [l['text'] for l in logs])

        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)

        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')
        logs = self.rest.get_logs(5)
        self.assertTrue('auto-reprovision count reset from 0' in [l['text'] for l in logs])

        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 2,
                                                            timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                            self)
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 0)
        self._start_couchbase(server_fail2)
        self._start_couchbase(server_fail1)
        self.sleep(30)
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 2)
        logs = self.rest.get_logs(5)
        self.assertTrue('auto-reprovision is disabled as maximum number of nodes (2) '
                        'that can be auto-reprovisioned has been reached.' in [l['text'] for l in logs])

        self.log.info("resetting the autoreprovision count")
        if not self.rest.reset_autoreprovision():
            self.fail('failed to reset autoreprovision count!')
        settings = self.rest.get_autoreprovision_settings()
        self.assertEqual(settings.enabled, True)
        self.assertEqual(settings.max_nodes, 2)
        self.assertEqual(settings.count, 0)
        logs = self.rest.get_logs(5)
        self.assertTrue('auto-reprovision count reset from 2' in [l['text'] for l in logs])

        helper = RestHelper(self.rest)
        self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
        self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
        self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
        self.assertTrue(self.rest.monitorRebalance())
        logs = self.rest.get_logs(5)
        # https://issues.couchbase.com/browse/MB-24520
        self.assertFalse('Reset auto-failover count' in [l['text'] for l in logs])
        self.assertTrue('Rebalance completed successfully.' in [l['text'] for l in logs])

    def _stop_couchbase(self, server):
        shell = RemoteMachineShellConnection(server)
        shell.stop_couchbase()
        shell.disconnect()
        log.info("stopped couchbase server on {0}".format(server))

    def _start_couchbase(self, server):
        shell = RemoteMachineShellConnection(server)
        shell.start_couchbase()
        shell.disconnect()
        log.info("srarted couchbase server on {0}".format(server))

    # the signals SIGSTOP and SIGCONT do not exist for Windows
    def _pause_couchbase(self, server):
        self._pause_beam(server)
        self._pause_memcached(server)

    # the signals SIGSTOP and SIGCONT do not exist for Windows
    def _pause_memcached(self, server):
        shell = RemoteMachineShellConnection(server)
        shell.pause_memcached()
        shell.disconnect()
        log.info("stopped couchbase server on {0}".format(server))

    # the signals SIGSTOP and SIGCONT do not exist for Windows
    def _pause_beam(self, server):
        shell = RemoteMachineShellConnection(server)
        shell.pause_beam()
        shell.disconnect()
        log.info("stopped couchbase server on {0}".format(server))

    @staticmethod
    def load_bucket_and_return_the_keys(servers=None,
                                        name='default',
                                        ram_load_ratio=-1,
                                        number_of_items=-1,
                                        value_size_distribution=None,
                                        number_of_threads=1,
                                        override_vBucketId=-1,
                                        write_only=False,
                                        moxi=True,
                                        delete_ratio=0,
                                        expiry_ratio=0):
        inserted_keys = []
        rejected_keys = []
        log = logger.Logger.get_logger()
        threads = MemcachedClientHelper.create_threads(servers,
                                                       name,
                                                       ram_load_ratio,
                                                       number_of_items,
                                                       value_size_distribution,
                                                       number_of_threads,
                                                       override_vBucketId,
                                                       write_only=write_only,
                                                       moxi=moxi,
                                                       delete_ratio=delete_ratio,
                                                       expiry_ratio=expiry_ratio)

        # we can start them!
        for thread in threads:
            thread.start()
        log.info("waiting for all worker thread to finish their work...")
        [thread.join() for thread in threads]
        log.info("worker threads are done...")

        inserted_count = 0
        rejected_count = 0
        deleted_count = 0
        expired_count = 0
        for thread in threads:
            t_inserted, t_rejected = thread.keys_set()
            inserted_count += thread.inserted_keys_count()
            rejected_count += thread.rejected_keys_count()
            deleted_count += thread._delete_count
            expired_count += thread._expiry_count
            inserted_keys.extend(t_inserted)
            rejected_keys.extend(t_rejected)
        msg = "inserted keys count : {0} , rejected keys count : {1}"
        log.info(msg.format(inserted_count, rejected_count))
        msg = "deleted keys count : {0} , expired keys count : {1}"
        log.info(msg.format(deleted_count, expired_count))
        return inserted_count, rejected_count

    def verify_loaded_data(self, master, bucket, inserted_count):
        rest = RestConnection(master)
        bucket_count_map = rest.get_buckets_itemCount()
        msg = "Before count : {0} , After count : {1}"
        log.info(msg.format(inserted_count, bucket_count_map[bucket]))
        self.assertEqual(bucket_count_map[bucket], inserted_count,
                         msg="bucket item count does not match the number of inserted keys")

    def _cluster_setup(self):
        keys_count = self.input.param("keys-count", 0)
        num_buckets = self.input.param("num-buckets", 1)
        bucketType = self.input.param("bucketType", "ephemeral")
        evictionPolicy = self.input.param("evictionPolicy", "noEviction")  # fullEviction

        # master = self.servers[0]
        # credentials = self.input.membase_settings
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        rest.init_cluster(username=self.master.rest_username,
                          password=self.master.rest_password)
        memory = min(info.mcdMemoryReserved, self.input.param("kv_memory", 1000))
        rest.init_cluster_memoryQuota(memoryQuota=memory)
        rest.reset_autoreprovision()
        self._add_and_rebalance(self.servers, True)

        if num_buckets == 1:
            bucket_name = "default"
            bucket_ram = info.memoryQuota * 2 // 3
            rest.create_bucket(bucket=bucket_name,
                               ramQuotaMB=bucket_ram,
                               replicaNumber=self.replicas,
                               proxyPort=info.moxi,
                               bucketType=bucketType,
                               evictionPolicy=evictionPolicy)
        else:
            created = BucketOperationHelper.create_multiple_buckets(
                self.master, self.replicas, howmany=num_buckets,
                bucketType=bucketType, evictionPolicy=evictionPolicy)
            self.assertTrue(created, "unable to create multiple buckets")

        buckets = rest.get_buckets()
        for bucket in buckets:
            ready = BucketOperationHelper.wait_for_memcached(self.master, bucket.name)
            self.assertTrue(ready, msg="wait_for_memcached failed")

        for bucket in buckets:
            distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05}
            inserted_count, rejected_count = self.load_bucket_and_return_the_keys(servers=[self.master],
                                                                                name=bucket.name,
                                                                                # ram_load_ratio=0.02,
                                                                                value_size_distribution=distribution,
                                                                                write_only=True,
                                                                                moxi=True,
                                                                                number_of_threads=2,
                                                                                number_of_items=keys_count)
            self.loaded_items[bucket.name] = inserted_count

    def _add_and_rebalance(self, servers, wait_for_rebalance=True):
        log = logger.Logger.get_logger()
        master = servers[0]
        all_nodes_added = True
        rebalanced = True
        rest = RestConnection(master)
        if len(servers) > 1:
            for serverInfo in servers[1:]:
                log.info('adding {0} node : {1}:{2} to the cluster'.format(
                    serverInfo.services, serverInfo.ip, serverInfo.port))
                otpNode = rest.add_node(master.rest_username, master.rest_password, serverInfo.ip, port=serverInfo.port,
                                        services=["kv"])
                if otpNode:
                    log.info('added node : {0} to the cluster'.format(otpNode.id))
                else:
                    all_nodes_added = False
                    break
            if all_nodes_added:
                rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], ejectedNodes=[])
                if wait_for_rebalance:
                    rebalanced &= rest.monitorRebalance()
                else:
                    rebalanced = False
        return all_nodes_added and rebalanced
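
# The auto-reprovision tests above drive the cluster entirely through
# RestConnection helpers such as update_autoreprovision_settings() and
# reset_autoreprovision(). As a rough standalone sketch only, the REST calls
# they presumably wrap look like the following; the /settings/autoReprovision
# and /settings/autoReprovision/resetCount endpoints, the 8091 admin port and
# the basic-auth credentials are assumptions based on the public Couchbase
# REST API, not code taken from this suite.
import requests

def set_auto_reprovision(host, user, password, enabled=True, max_nodes=1):
    # enable/disable auto-reprovision and cap how many nodes may be reprovisioned
    resp = requests.post("http://{0}:8091/settings/autoReprovision".format(host),
                         auth=(user, password),
                         data={"enabled": "true" if enabled else "false",
                               "maxNodes": max_nodes})
    return resp.status_code == 200

def reset_auto_reprovision_count(host, user, password):
    # reset the counter of nodes that have already been auto-reprovisioned
    resp = requests.post("http://{0}:8091/settings/autoReprovision/resetCount".format(host),
                         auth=(user, password))
    return resp.status_code == 200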
Ejemplo n.º 56
class AutoFailoverTests(unittest.TestCase):
    def setUp(self):
        self.input = TestInputSingleton.input
        self.case_number = self.input.param("case_number", 0)
        self.servers = self.input.servers
        self.log = logger.Logger().get_logger()
        self.master = self.servers[0]
        self.rest = RestConnection(self.master)
        self.timeout = 60
        AutoFailoverBaseTest.common_setup(self.input, self)
        self._cluster_setup()

    def tearDown(self):
        AutoFailoverBaseTest.common_tearDown(self.servers, self)

    def sleep(self, timeout=1, message=""):
        self.log.info("sleep for {0} secs. {1} ...".format(timeout, message))
        time.sleep(timeout)

    def test_enable(self):
        status = self.rest.update_autofailover_settings(True, self.timeout / 2)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        #read settings and verify
        settings = self.rest.get_autofailover_settings()
        self.assertEqual(settings.enabled, True)

    def test_disable(self):
        status = self.rest.update_autofailover_settings(False, self.timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        #read settings and verify
        settings = self.rest.get_autofailover_settings()
        self.assertEqual(settings.enabled, False)

    def test_valid_timeouts(self):
        timeouts = [30, 31, 300, 3600]
        for timeout in timeouts:
            status = self.rest.update_autofailover_settings(True, timeout)
            if not status:
                self.fail(
                    'failed to change autofailover_settings! See MB-7282')
            #read settings and verify
            settings = self.rest.get_autofailover_settings()
            self.assertTrue(settings.timeout == timeout)

    def test_30s_timeout_firewall(self):
        timeout = self.timeout / 2
        server_fail = self.servers[1]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        RemoteUtilHelper.enable_firewall(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_60s_timeout_firewall(self):
        timeout = self.timeout
        server_fail = self.servers[1]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        RemoteUtilHelper.enable_firewall(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_30s_timeout_stop(self):
        timeout = self.timeout / 2
        server_fail = self.servers[1]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self._stop_couchbase(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_60s_timeout_stop(self):
        timeout = self.timeout
        server_fail = self.servers[1]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self._stop_couchbase(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_reset_count(self):
        timeout = self.timeout / 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

        self.log.info("resetting the autofailover count")
        if not self.rest.reset_autofailover():
            self.fail('failed to reset autofailover count!')

        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

        self.log.info("resetting the autofailover count")
        if not self.rest.reset_autofailover():
            self.fail('failed to reset autofailover count!')

    def test_30s_timeout_pause(self):
        timeout = self.timeout / 2
        server_fail = self.servers[1]
        shell = RemoteMachineShellConnection(server_fail)
        type = shell.extract_remote_info().distribution_type
        shell.disconnect()
        if type.lower() == 'windows':
            self.log.info(
                "test will be skipped because the signals SIGSTOP and SIGCONT do not exist for Windows"
            )
            return
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self._pause_couchbase(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_60s_timeout_pause(self):
        timeout = self.timeout
        server_fail = self.servers[1]
        shell = RemoteMachineShellConnection(server_fail)
        type = shell.extract_remote_info().distribution_type
        shell.disconnect()
        if type.lower() == 'windows':
            self.log.info(
                "test will be skipped because the signals SIGSTOP and SIGCONT do not exist for Windows"
            )
            return
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self._pause_couchbase(server_fail)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def test_invalid_timeouts(self):
        timeouts = [-360, -60, 0, 15, 29, 300000]
        for timeout in timeouts:
            status = self.rest.update_autofailover_settings(True, timeout)
            if status:
                self.fail(
                    'autofailover_settings have been changed incorrectly!')
            #read settings and verify
            settings = self.rest.get_autofailover_settings()
            self.assertTrue(settings.timeout >= 30)

    def test_two_failed_nodes(self):
        timeout = self.timeout / 2
        server_fail1 = self.servers[1]
        server_fail2 = self.servers[2]
        status = self.rest.update_autofailover_settings(True, timeout)
        if not status:
            self.fail('failed to change autofailover_settings! See MB-7282')
        self.sleep(5)
        self.log.info("stopping the first server")
        self._stop_couchbase(server_fail1)
        AutoFailoverBaseTest.wait_for_failover_or_assert(
            self.master, 1,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

        self.log.info("stopping the second server")
        self._stop_couchbase(server_fail2)
        AutoFailoverBaseTest.wait_for_no_failover_or_assert(
            self.master, 2,
            timeout + AutoFailoverBaseTest.MAX_FAIL_DETECT_TIME, self)

    def _stop_couchbase(self, server):
        shell = RemoteMachineShellConnection(server)
        shell.stop_couchbase()
        shell.disconnect()
        log.info("stopped couchbase server on {0}".format(server))

    #the signals SIGSTOP and SIGCONT do not exist for Windows
    def _pause_couchbase(self, server):
        self._pause_beam(server)
        self._pause_memcached(server)

    #the signals SIGSTOP and SIGCONT do not exist for Windows
    def _pause_memcached(self, server):
        shell = RemoteMachineShellConnection(server)
        shell.pause_memcached()
        shell.disconnect()
        log.info("stopped couchbase server on {0}".format(server))

    #the signals SIGSTOP and SIGCONT do not exist for Windows
    def _pause_beam(self, server):
        shell = RemoteMachineShellConnection(server)
        shell.pause_beam()
        shell.disconnect()
        log.info("stopped couchbase server on {0}".format(server))

    def load_data(self, master, bucket, keys_count):
        inserted_keys_cnt = 0
        repeat_count = 0
        while inserted_keys_cnt < keys_count and repeat_count < 5:
            keys_cnt, rejected_keys_cnt = \
            MemcachedClientHelper.load_bucket(servers=[master],
                name=bucket,
                number_of_items=keys_count,
                number_of_threads=5,
                write_only=True)
            inserted_keys_cnt += keys_cnt
            if keys_cnt == 0:
                repeat_count += 1
            else:
                repeat_count = 0
        if repeat_count == 5:
            log.exception("impossible to load data")
        log.info("wait until data is completely persisted on the disk")
        RebalanceHelper.wait_for_persistence(master, bucket)
        return inserted_keys_cnt

    def _cluster_setup(self):
        replicas = self.input.param("replicas", 1)
        keys_count = self.input.param("keys-count", 0)
        num_buckets = self.input.param("num-buckets", 1)

        bucket_name = "default"
        master = self.servers[0]
        credentials = self.input.membase_settings
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        rest.init_cluster(username=self.master.rest_username,
                          password=self.master.rest_password)
        rest.init_cluster_memoryQuota(memoryQuota=info.mcdMemoryReserved)
        rest.reset_autofailover()
        ClusterOperationHelper.add_and_rebalance(self.servers, True)

        if num_buckets == 1:
            bucket_ram = info.memoryQuota * 2 / 3
            rest.create_bucket(bucket=bucket_name,
                               ramQuotaMB=bucket_ram,
                               replicaNumber=replicas,
                               proxyPort=info.moxi)
        else:
            created = BucketOperationHelper.create_multiple_buckets(
                self.master, replicas, howmany=num_buckets)
            self.assertTrue(created, "unable to create multiple buckets")

        buckets = rest.get_buckets()
        for bucket in buckets:
            ready = BucketOperationHelper.wait_for_memcached(
                self.master, bucket.name)
            self.assertTrue(ready, msg="wait_for_memcached failed")

        for bucket in buckets:
            inserted_keys_cnt = self.load_data(self.master, bucket.name,
                                               keys_count)
            log.info('inserted {0} keys'.format(inserted_keys_cnt))
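
# Similarly, AutoFailoverTests configures auto-failover through
# RestConnection.update_autofailover_settings(). A minimal sketch of the call it
# presumably wraps is shown below; the /settings/autoFailover endpoint, its
# enabled/timeout form fields and the 8091 admin port are assumptions based on
# the public Couchbase REST API, not code taken from this suite.
import requests

def set_auto_failover(host, user, password, enabled=True, timeout=60):
    # timeout: seconds a node must be unresponsive before it is automatically failed over
    resp = requests.post("http://{0}:8091/settings/autoFailover".format(host),
                         auth=(user, password),
                         data={"enabled": "true" if enabled else "false",
                               "timeout": timeout})
    return resp.status_code == 200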
Ejemplo n.º 57
 def rebalance_out_with_failover(self):
     fail_over = self.input.param("fail_over", False)
     self.rest = RestConnection(self.master)
     gen_delete = BlobGenerator('mike',
                                'mike-',
                                self.value_size,
                                start=self.num_items / 2,
                                end=self.num_items)
     gen_create = BlobGenerator('mike',
                                'mike-',
                                self.value_size,
                                start=self.num_items + 1,
                                end=self.num_items * 3 / 2)
     # define which doc's ops will be performed during rebalancing
     # allows multiple of them but one by one
     tasks = []
     if (self.doc_ops is not None):
         if ("update" in self.doc_ops):
             tasks += self._async_load_all_buckets(self.master,
                                                   self.gen_update,
                                                   "update", 0)
         if ("create" in self.doc_ops):
             tasks += self._async_load_all_buckets(self.master, gen_create,
                                                   "create", 0)
         if ("delete" in self.doc_ops):
             tasks += self._async_load_all_buckets(self.master, gen_delete,
                                                   "delete", 0)
         for task in tasks:
             task.result()
     ejectedNode = self.find_node_info(self.master,
                                       self.servers[self.nodes_init - 1])
     self._verify_stats_all_buckets(self.servers[:self.num_servers],
                                    timeout=120)
     self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
     self.sleep(20)
     prev_failover_stats = self.get_failovers_logs(
         self.servers[:self.nodes_init], self.buckets)
     prev_vbucket_stats = self.get_vbucket_seqnos(
         self.servers[:self.nodes_init], self.buckets)
     record_data_set = self.get_data_set_all(self.servers[:self.nodes_init],
                                             self.buckets)
     self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                          prev_failover_stats)
     self.rest = RestConnection(self.master)
     chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
     new_server_list = self.add_remove_servers(
         self.servers, self.servers[:self.nodes_init],
         [self.servers[self.nodes_init - 1], chosen[0]], [])
     # Mark Node for failover
     success_failed_over = self.rest.fail_over(chosen[0].id,
                                               graceful=fail_over)
     self.nodes = self.rest.node_statuses()
     self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                         ejectedNodes=[chosen[0].id, ejectedNode.id])
     self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                     msg="Rebalance failed")
     self.verify_cluster_stats(new_server_list,
                               check_ep_items_remaining=True)
     self.sleep(30)
     self.data_analysis_all(record_data_set, new_server_list, self.buckets)
     self.verify_unacked_bytes_all_buckets()
     nodes = self.get_nodes_in_cluster(self.master)
     self.vb_distribution_analysis(servers=nodes,
                                   buckets=self.buckets,
                                   std=1.0,
                                   total_vbuckets=self.total_vbuckets)
Ejemplo n.º 58
 def test_node_memcached_failure_in_series(self):
     timeout = self.timeout // 2
     status = self.rest.update_autoreprovision_settings(True, 1)
     if not status:
         self.fail('failed to change autoreprovision_settings!')
     self.sleep(5)
     data_lost = False
     for i in reversed(range(len(self.servers))):
         print(self.servers[i])
         operation = random.choice(['stop', 'memcached_failure', 'restart', 'failover', 'reboot'])
         shell = RemoteMachineShellConnection(self.servers[i])
         print("operation", operation)
         if i == 0:
             self.master = self.servers[1]
         if operation == 'stop':
             self._stop_couchbase(self.servers[i])
         elif operation == 'memcached_failure':
             self._pause_couchbase(self.servers[i])
         elif operation == 'restart':
             shell.restart_couchbase()
         elif operation == 'failover':
             RemoteUtilHelper.enable_firewall(self.servers[i])
         elif operation == 'reboot':
             if shell.extract_remote_info().type.lower() == 'windows':
                 o, r = shell.execute_command("shutdown -r -f -t 0")
                 self.sleep(200)
             elif shell.extract_remote_info().type.lower() == 'linux':
                 o, r = shell.execute_command("reboot")
             shell.log_command_output(o, r)
             self.sleep(60)
         self.sleep(40)
         if operation == 'memcached_failure':
             AutoReprovisionBaseTest.wait_for_warmup_or_assert(self.master, 1,
                                                               timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                               self)
         if operation != 'restart' and operation != 'memcached_failure' and operation != 'reboot':
             AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 1,
                                                                 timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                                 self)
         if operation != 'restart':
             RemoteUtilHelper.common_basic_setup([self.servers[i]])
         AutoReprovisionBaseTest.wait_for_failover_or_assert(self.master, 0,
                                                             timeout + AutoReprovisionBaseTest.MAX_FAIL_DETECT_TIME,
                                                             self)
         helper = RestHelper(RestConnection(self.master))
         self.assertTrue(helper.is_cluster_healthy(), "cluster status is not healthy")
         self.sleep(40)
         if operation == 'memcached_failure' or operation == 'failover':
             self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
         else:
             if 'kv' in self.servers[i].services and self.replicas > 0:
                 self.assertFalse(helper.is_cluster_rebalanced(), "cluster is balanced")
                 self.rest.rebalance(otpNodes=[node.id for node in self.rest.node_statuses()], ejectedNodes=[])
                 self.assertTrue(self.rest.monitorRebalance())
             else:
                 self.assertTrue(helper.is_cluster_rebalanced(), "cluster is not balanced")
         buckets = self.rest.get_buckets()
         if self.replicas == 0 and (operation == 'restart' or operation == 'reboot'):
             data_lost = True
         for bucket in buckets:
             if not data_lost:
                 self.verify_loaded_data(self.master, bucket.name, self.loaded_items[bucket.name])
Ejemplo n.º 59
class RebalanceOutTests(RebalanceBaseTest):
    def setUp(self):
        super(RebalanceOutTests, self).setUp()

    def tearDown(self):
        super(RebalanceOutTests, self).tearDown()

    """Rebalances nodes out of a cluster while doing docs ops:create, delete, update.

    This test begins with all servers clustered together and  loads a user defined
    number of items into the cluster. Before rebalance we perform docs ops(add/remove/update/read)
    in the cluster( operate with a half of items that were loaded before).It then remove nodes_out
    from the cluster at a time and rebalances.  Once the cluster has been rebalanced we wait for the
    disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the
    curr_items_total. We also check for data and its meta-data, vbucket sequene numbers"""

    def rebalance_out_after_ops(self):
        gen_delete = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items / 2,
                                   end=self.num_items)
        gen_create = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items + 1,
                                   end=self.num_items * 3 / 2)
        # define which doc's ops will be performed during rebalancing
        # allows multiple of them but one by one
        tasks = []
        if (self.doc_ops is not None):
            if ("update" in self.doc_ops):
                tasks += self._async_load_all_buckets(self.master,
                                                      self.gen_update,
                                                      "update", 0)
            if ("create" in self.doc_ops):
                tasks += self._async_load_all_buckets(self.master, gen_create,
                                                      "create", 0)
            if ("delete" in self.doc_ops):
                tasks += self._async_load_all_buckets(self.master, gen_delete,
                                                      "delete", 0)
            for task in tasks:
                task.result()
        servs_out = [
            self.servers[self.num_servers - i - 1]
            for i in range(self.nodes_out)
        ]
        self._verify_stats_all_buckets(self.servers[:self.num_servers],
                                       timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
        prev_failover_stats = self.get_failovers_logs(
            self.servers[:self.num_servers], self.buckets)
        prev_vbucket_stats = self.get_vbucket_seqnos(
            self.servers[:self.num_servers], self.buckets)
        record_data_set = self.get_data_set_all(
            self.servers[:self.num_servers], self.buckets)
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                             prev_failover_stats)
        rebalance = self.cluster.async_rebalance(self.servers[:1], [],
                                                 servs_out)
        rebalance.result()
        self._verify_stats_all_buckets(self.servers[:self.num_servers -
                                                    self.nodes_out],
                                       timeout=120)
        self.verify_cluster_stats(self.servers[:self.num_servers -
                                               self.nodes_out],
                                  check_ep_items_remaining=True)
        new_failover_stats = self.compare_failovers_logs(
            prev_failover_stats,
            self.servers[:self.num_servers - self.nodes_out], self.buckets)
        new_vbucket_stats = self.compare_vbucket_seqnos(
            prev_vbucket_stats,
            self.servers[:self.num_servers - self.nodes_out],
            self.buckets,
            perNode=False)
        self.sleep(60)
        self.data_analysis_all(
            record_data_set, self.servers[:self.num_servers - self.nodes_out],
            self.buckets)
        self.compare_vbucketseq_failoverlogs(new_vbucket_stats,
                                             new_failover_stats)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes,
                                      buckets=self.buckets,
                                      std=1.0,
                                      total_vbuckets=self.total_vbuckets)

    """Rebalances nodes out with failover and full recovery add back of a node

    This test begins with all servers clustered together and  loads a user defined
    number of items into the cluster. Before rebalance we perform docs ops(add/remove/update/read)
    in the cluster( operate with a half of items that were loaded before).It then remove nodes_out
    from the cluster at a time and rebalances.  Once the cluster has been rebalanced we wait for the
    disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the
    curr_items_total. We also check for data and its meta-data, vbucket sequene numbers"""

    def rebalance_out_with_failover_full_addback_recovery(self):
        gen_delete = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items / 2,
                                   end=self.num_items)
        gen_create = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items + 1,
                                   end=self.num_items * 3 / 2)
        # define which doc's ops will be performed during rebalancing
        # allows multiple of them but one by one
        tasks = []
        if (self.doc_ops is not None):
            if ("update" in self.doc_ops):
                tasks += self._async_load_all_buckets(self.master,
                                                      self.gen_update,
                                                      "update", 0)
            if ("create" in self.doc_ops):
                tasks += self._async_load_all_buckets(self.master, gen_create,
                                                      "create", 0)
            if ("delete" in self.doc_ops):
                tasks += self._async_load_all_buckets(self.master, gen_delete,
                                                      "delete", 0)
            for task in tasks:
                task.result()
        servs_out = [
            self.servers[self.num_servers - i - 1]
            for i in range(self.nodes_out)
        ]
        self._verify_stats_all_buckets(self.servers[:self.num_servers],
                                       timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
        self.rest = RestConnection(self.master)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        self.sleep(20)
        prev_failover_stats = self.get_failovers_logs(
            self.servers[:self.num_servers], self.buckets)
        prev_vbucket_stats = self.get_vbucket_seqnos(
            self.servers[:self.num_servers], self.buckets)
        record_data_set = self.get_data_set_all(
            self.servers[:self.num_servers], self.buckets)
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                             prev_failover_stats)
        # Mark Node for failover
        success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
        # Mark Node for full recovery
        if success_failed_over:
            self.rest.set_recovery_type(otpNode=chosen[0].id,
                                        recoveryType="full")
        rebalance = self.cluster.async_rebalance(self.servers[:1], [],
                                                 servs_out)
        rebalance.result()
        self.verify_cluster_stats(self.servers[:self.num_servers -
                                               self.nodes_out],
                                  check_ep_items_remaining=True)
        self.compare_failovers_logs(
            prev_failover_stats,
            self.servers[:self.num_servers - self.nodes_out], self.buckets)
        self.sleep(30)
        self.data_analysis_all(
            record_data_set, self.servers[:self.num_servers - self.nodes_out],
            self.buckets)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes,
                                      buckets=self.buckets,
                                      std=1.0,
                                      total_vbuckets=self.total_vbuckets)

    """Rebalances nodes out with failover

    This test begins with all servers clustered together and  loads a user defined
    number of items into the cluster. Before rebalance we perform docs ops(add/remove/update/read)
    in the cluster( operate with a half of items that were loaded before).It then remove nodes_out
    from the cluster at a time and rebalances.  Once the cluster has been rebalanced we wait for the
    disk queues to drain, and then verify that there has been no data loss, sum(curr_items) match the
    curr_items_total. We also check for data and its meta-data, vbucket sequene numbers"""

    def rebalance_out_with_failover(self):
        fail_over = self.input.param("fail_over", False)
        self.rest = RestConnection(self.master)
        gen_delete = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items / 2,
                                   end=self.num_items)
        gen_create = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items + 1,
                                   end=self.num_items * 3 / 2)
        # define which doc's ops will be performed during rebalancing
        # allows multiple of them but one by one
        tasks = []
        if (self.doc_ops is not None):
            if ("update" in self.doc_ops):
                tasks += self._async_load_all_buckets(self.master,
                                                      self.gen_update,
                                                      "update", 0)
            if ("create" in self.doc_ops):
                tasks += self._async_load_all_buckets(self.master, gen_create,
                                                      "create", 0)
            if ("delete" in self.doc_ops):
                tasks += self._async_load_all_buckets(self.master, gen_delete,
                                                      "delete", 0)
            for task in tasks:
                task.result()
        ejectedNode = self.find_node_info(self.master,
                                          self.servers[self.nodes_init - 1])
        self._verify_stats_all_buckets(self.servers[:self.num_servers],
                                       timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.num_servers])
        self.sleep(20)
        prev_failover_stats = self.get_failovers_logs(
            self.servers[:self.nodes_init], self.buckets)
        prev_vbucket_stats = self.get_vbucket_seqnos(
            self.servers[:self.nodes_init], self.buckets)
        record_data_set = self.get_data_set_all(self.servers[:self.nodes_init],
                                                self.buckets)
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                             prev_failover_stats)
        self.rest = RestConnection(self.master)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        new_server_list = self.add_remove_servers(
            self.servers, self.servers[:self.nodes_init],
            [self.servers[self.nodes_init - 1], chosen[0]], [])
        # Mark Node for failover
        success_failed_over = self.rest.fail_over(chosen[0].id,
                                                  graceful=fail_over)
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[chosen[0].id, ejectedNode.id])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                        msg="Rebalance failed")
        self.verify_cluster_stats(new_server_list,
                                  check_ep_items_remaining=True)
        self.sleep(30)
        self.data_analysis_all(record_data_set, new_server_list, self.buckets)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes,
                                      buckets=self.buckets,
                                      std=1.0,
                                      total_vbuckets=self.total_vbuckets)

    """Rebalances nodes out of a cluster while doing docs ops:create, delete, update.

    This test begins with all servers clustered together and  loads a user defined
    number of items into the cluster. It then remove nodes_out from the cluster at a time
    and rebalances. During the rebalance we perform docs ops(add/remove/update/read)
    in the cluster( operate with a half of items that were loaded before).
    Once the cluster has been rebalanced we wait for the disk queues to drain,
    and then verify that there has been no data loss, sum(curr_items) match the curr_items_total.
    Once all nodes have been rebalanced the test is finished."""

    def rebalance_out_with_ops(self):
        tasks = list()
        gen_delete = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items / 2,
                                   end=self.num_items)
        gen_create = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items + 1,
                                   end=self.num_items * 3 / 2)
        servs_out = [
            self.servers[self.num_servers - i - 1]
            for i in range(self.nodes_out)
        ]
        # define which doc's ops will be performed during rebalancing
        # allows multiple of them but one by one
        if self.doc_ops is not None:
            if "update" in self.doc_ops:
                tasks += self._async_load_all_buckets(self.master,
                                                      self.gen_update,
                                                      "update", 0)
            if "create" in self.doc_ops:
                tasks += self._async_load_all_buckets(self.master, gen_create,
                                                      "create", 0)
            if "delete" in self.doc_ops:
                tasks += self._async_load_all_buckets(self.master, gen_delete,
                                                      "delete", 0)
        rebalance = self.cluster.async_rebalance(
            self.servers[:1], [],
            servs_out,
            sleep_before_rebalance=self.sleep_before_rebalance)

        rebalance.result()
        for task in tasks:
            task.result()

        self.verify_cluster_stats(self.servers[:self.num_servers -
                                               self.nodes_out])
        self.verify_unacked_bytes_all_buckets()

        # Validate seq_no snap_start/stop values after rebalance
        self.check_snap_start_corruption()

    """Rebalances nodes out of a cluster while doing docs ops:create, delete, update along with compaction.

    This test begins with all servers clustered together and  loads a user defined
    number of items into the cluster. It then remove nodes_out from the cluster at a time
    and rebalances. During the rebalance we perform docs ops(add/remove/update/read)
    in the cluster( operate with a half of items that were loaded before).
    Once the cluster has been rebalanced we wait for the disk queues to drain,
    and then verify that there has been no data loss, sum(curr_items) match the curr_items_total.
    Once all nodes have been rebalanced the test is finished."""

    def rebalance_out_with_compaction_and_ops(self):
        gen_delete = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items / 2,
                                   end=self.num_items)
        gen_create = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   start=self.num_items + 1,
                                   end=self.num_items * 3 / 2)
        servs_out = [
            self.servers[self.num_servers - i - 1]
            for i in range(self.nodes_out)
        ]
        tasks = [self.cluster.async_rebalance(self.servers[:1], [], servs_out)]
        for bucket in self.buckets:
            tasks.append(self.cluster.async_compact_bucket(
                self.master, bucket))
        # define which doc ops will be performed during rebalancing;
        # multiple op types are allowed, executed one after another
        if self.doc_ops is not None:
            if "update" in self.doc_ops:
                tasks += self._async_load_all_buckets(self.master,
                                                      self.gen_update,
                                                      "update", 0)
            if "create" in self.doc_ops:
                tasks += self._async_load_all_buckets(self.master, gen_create,
                                                      "create", 0)
            if "delete" in self.doc_ops:
                tasks += self._async_load_all_buckets(self.master, gen_delete,
                                                      "delete", 0)
        for task in tasks:
            task.result()
        self.verify_cluster_stats(self.servers[:self.num_servers -
                                               self.nodes_out])
        self.verify_unacked_bytes_all_buckets()

    """Rebalances nodes from a cluster during getting random keys.

    This test begins with all servers clustered together and loads a user defined
    number of items into the cluster. Then we send requests to all nodes in the cluster
    to get random key values. Next step is remove nodes_out from the cluster
    and rebalance it. During rebalancing we get random keys from all nodes and verify
    that are different every time. Once the cluster has been rebalanced
    we again get random keys from all new nodes in the cluster,
    than we wait for the disk queues to drain, and then verify that there has been no data loss,
    sum(curr_items) match the curr_items_total."""

    def rebalance_out_get_random_key(self):
        servs_out = [
            self.servers[self.num_servers - i - 1]
            for i in range(self.nodes_out)
        ]
        # get random keys from all nodes currently in the cluster
        rest_cons = [
            RestConnection(self.servers[i]) for i in xrange(self.num_servers)
        ]
        list_threads = []
        for rest in rest_cons:
            t = Thread(target=rest.get_random_key,
                       name="get_random_key",
                       args=(self.default_bucket_name, ))
            list_threads.append(t)
            t.start()
        [t.join() for t in list_threads]

        rest_cons = [
            RestConnection(self.servers[i])
            for i in xrange(self.num_servers - self.nodes_out)
        ]
        rebalance = self.cluster.async_rebalance(
            self.servers[:self.num_servers], [], servs_out)
        self.sleep(2)
        result = []
        num_iter = 0
        # get random keys for each node during rebalancing
        while rest_cons[0]._rebalance_progress_status() == 'running' \
                and num_iter < 100:
            list_threads = []
            temp_result = []
            self.log.info("getting random keys for all nodes in cluster....")
            for rest in rest_cons:
                t = Thread(target=rest.get_random_key,
                           name="get_random_key",
                           args=(self.default_bucket_name, ))
                list_threads.append(t)
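                # fetch a key synchronously so results can be compared across
                # iterations; the thread created above adds an additional
                # concurrent request once started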
                temp_result.append(
                    rest.get_random_key(self.default_bucket_name))

                t.start()
            [t.join() for t in list_threads]

            if tuple(temp_result) == tuple(result):
                self.log.error("random keys have not changed")
            else:
                result = temp_result
            num_iter += 1

        rebalance.result()
        self.verify_cluster_stats(self.servers[:self.num_servers -
                                               self.nodes_out])
        self.verify_unacked_bytes_all_buckets()

    """Rebalances nodes out of a cluster while doing docs' ops.

    This test begins with all servers clustered together and loads a user
    defined number of items into the cluster. It then removes two nodes at a
    time from the cluster and rebalances. During the rebalance we update
    (all of the items in the cluster)/delete(num_items/(num_servers -1) in
    each iteration)/create(a half of initial items in each iteration).
    Once the cluster has been rebalanced the test waits for the disk queues
    to drain and then verifies that there has been no data loss,
    sum(curr_items) match the curr_items_total.
    Once all nodes have been rebalanced out of the cluster the test finishes.
    """

    def incremental_rebalance_out_with_ops(self):
        batch_size = 1000
        for i in reversed(range(1, self.num_servers, 2)):
            if i == 1:
                batch_size = 1
            tasks = list()
            if self.doc_ops is not None:
                # define which doc_ops will be performed during rebalancing;
                # only one op type is applied per run
                if "update" in self.doc_ops:
                    # half of the data will be updated in each iteration
                    tasks += self._async_load_all_buckets(
                        self.master,
                        self.gen_update,
                        "update",
                        0,
                        batch_size=batch_size)
                elif "create" in self.doc_ops:
                    # half of the initial data will be added in each iteration
                    gen_create = BlobGenerator(
                        'mike',
                        'mike-',
                        self.value_size,
                        start=int(self.num_items * (self.num_servers - i) / 2.0),
                        end=int(self.num_items * (self.num_servers - i + 1) / 2.0))
                    tasks += self._async_load_all_buckets(
                        self.master,
                        gen_create,
                        "create",
                        0,
                        batch_size=batch_size)
                elif "delete" in self.doc_ops:
                    # 1/(num_servers - 1) of the initial data is removed in
                    # each iteration; at the end the bucket should be empty
                    # (or hold only a couple of items). A standalone sketch of
                    # these ranges follows this method.
                    gen_delete = BlobGenerator(
                        'mike',
                        'mike-',
                        self.value_size,
                        start=int(self.num_items *
                                  (1 - i / (self.num_servers - 1.0))) + 1,
                        end=int(self.num_items * (1 - (i - 1) /
                                                  (self.num_servers - 1.0))))
                    tasks += self._async_load_all_buckets(
                        self.master,
                        gen_delete,
                        "delete",
                        0,
                        batch_size=batch_size)
            rebalance = self.cluster.async_rebalance(
                self.servers[:i], [],
                self.servers[i:i + 2],
                sleep_before_rebalance=self.sleep_before_rebalance)
            try:
                rebalance.result()
                for task in tasks:
                    task.result()
            except Exception, ex:
                rebalance.cancel()
                for task in tasks:
                    task.cancel()
                raise ex

            self.verify_cluster_stats(self.servers[:i])
            # Validate seq_no snap_start/stop values after rebalance
            self.check_snap_start_corruption()

        self.verify_unacked_bytes_all_buckets()
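
The key-range arithmetic in the create/delete branches above is easier to follow with concrete numbers. Below is a minimal standalone sketch (not part of the test suite; num_servers and num_items are hypothetical values) that evaluates the same start/end expressions for each loop iteration.

# Standalone sketch of the per-iteration key ranges used by
# incremental_rebalance_out_with_ops (hypothetical parameter values).
def delete_range(num_items, num_servers, i):
    # mirrors the gen_delete start/end expressions above
    start = int(num_items * (1 - i / (num_servers - 1.0))) + 1
    end = int(num_items * (1 - (i - 1) / (num_servers - 1.0)))
    return start, end

def create_range(num_items, num_servers, i):
    # mirrors the gen_create start/end expressions above
    start = int(num_items * (num_servers - i) / 2.0)
    end = int(num_items * (num_servers - i + 1) / 2.0)
    return start, end

if __name__ == "__main__":
    num_servers, num_items = 5, 1000                # hypothetical values
    for i in reversed(range(1, num_servers, 2)):    # same loop order as the test
        print(i, delete_range(num_items, num_servers, i),
              create_range(num_items, num_servers, i))

With these assumed values the loop runs for i = 3 and i = 1, deleting keys [251, 500) and [751, 1000) while creating keys [1000, 1500) and [2000, 2500) respectively.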
Example No. 60
0
    def setUp(self):
        super(CollectionBase, self).setUp()
        self.log_setup_status("CollectionBase", "started")
        self.key = 'test_collection'.rjust(self.key_size, '0')
        self.simulate_error = self.input.param("simulate_error", None)
        self.error_type = self.input.param("error_type", "memory")
        self.doc_ops = self.input.param("doc_ops", None)
        self.spec_name = self.input.param("bucket_spec",
                                          "single_bucket.default")
        self.over_ride_spec_params = \
            self.input.param("override_spec_params", "").split(";")

        self.action_phase = self.input.param("action_phase",
                                             "before_default_load")
        self.crud_batch_size = 100
        self.num_nodes_affected = 1
        if self.num_replicas > 1:
            self.num_nodes_affected = 2

        if self.doc_ops:
            self.doc_ops = self.doc_ops.split(';')

        self.durability_helper = DurabilityHelper(
            self.log, len(self.cluster.nodes_in_cluster),
            self.durability_level)

        # Initialize cluster using given nodes
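        # (nodes_init == 1 means only the master forms the cluster; otherwise
        #  servers[1:nodes_init] are rebalanced in alongside the master)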
        nodes_init = self.cluster.servers[1:self.nodes_init] \
            if self.nodes_init != 1 else []
        self.task.rebalance([self.cluster.master], nodes_init, [])
        self.cluster.nodes_in_cluster.extend([self.cluster.master] +
                                             nodes_init)

        # Disable auto-failover to avoid failover of nodes
        status = RestConnection(self.cluster.master) \
            .update_autofailover_settings(False, 120, False)
        self.assertTrue(status, msg="Failure during disabling auto-failover")

        # Create bucket(s) and add rbac user
        self.bucket_util.add_rbac_user()
        buckets_spec = self.bucket_util.get_bucket_template_from_package(
            self.spec_name)
        doc_loading_spec = \
            self.bucket_util.get_crud_template_from_package("initial_load")

        # Process params to over_ride values if required
        self.over_ride_template_params(buckets_spec)
        self.over_ride_template_params(doc_loading_spec)

        # MB-38438, adding CollectionNotFoundException in retry exception
        doc_loading_spec[MetaCrudParams.RETRY_EXCEPTIONS].append(
            SDKException.CollectionNotFoundException)

        self.bucket_util.create_buckets_using_json_data(buckets_spec)
        self.bucket_util.wait_for_collection_creation_to_complete()

        # Init sdk_client_pool if not initialized before
        if self.sdk_client_pool is None:
            self.init_sdk_pool_object()

        # Create clients in SDK client pool
        self.log.info("Creating required SDK clients for client_pool")
        bucket_count = len(self.bucket_util.buckets)
        max_clients = self.task_manager.number_of_threads
        clients_per_bucket = int(ceil(max_clients / bucket_count))
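        # e.g. with a hypothetical 10 task-manager threads and 2 buckets,
        # ceil(10 / 2) = 5 SDK clients are created per bucket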
        for bucket in self.bucket_util.buckets:
            self.sdk_client_pool.create_clients(
                bucket, [self.cluster.master],
                clients_per_bucket,
                compression_settings=self.sdk_compression)

        # TODO: remove this once the bug is fixed
        self.sleep(60, "MB-38497")

        doc_loading_task = \
            self.bucket_util.run_scenario_from_spec(
                self.task,
                self.cluster,
                self.bucket_util.buckets,
                doc_loading_spec,
                mutation_num=0)
        if doc_loading_task.result is False:
            self.fail("Initial doc_loading failed")

        self.cluster_util.print_cluster_stats()

        # Verify initial doc load count
        self.bucket_util._wait_for_stats_all_buckets()
        self.bucket_util.validate_docs_per_collections_all_buckets()

        self.bucket_util.print_bucket_stats()
        self.bucket_helper_obj = BucketHelper(self.cluster.master)
        self.log_setup_status("CollectionBase", "complete")