    def test_simple_ui_request(self):
        rest = RestConnection(self.master)
        passed = True
        self.log.info("GET " + rest.query_baseUrl)
        status, content, header = rest._http_request(rest.query_baseUrl)
        self.log.info(header)
        if not status:
            self.log.info("wrong status for {0} GET request {1}".format(
                rest.query_baseUrl, header))
            passed = False
        self.assertTrue(passed, msg="some GET requests failed. See logs above")

        _, content, _ = rest._http_request(rest.query_baseUrl)
        occurrences = [
            m.start() for m in re.finditer('web request failed', content)
        ]
        for occurrence in occurrences:
            subcontent = content[occurrence - 1000:occurrence + 1000]
            if 'path,"/diag"' in subcontent:
                break
            else:
                passed = False
                self.log.info(subcontent)
            self.assertTrue(
                passed,
                "some web request failed in the server logs. See logs above")
Example #2
 def delete_task(self, state, repo_id, task_category, task_name):
     """ Delete a task
     """
     rest = RestConnection(self.server)
     assert(task_category in ['one-off', 'scheduled'])
     status, content, header = rest._http_request(
         rest.baseUrl + "_p/backup/internal/v1/cluster/self/repository/{}/{}/task/{}/{}".format(state, repo_id, task_category, task_name), 'DELETE')
Example #3
 def _retrieve_user_roles(self):
     rest = RestConnection(self.master_ip)
     url = "/settings/rbac/users"
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'GET')
     log.info(" Retrieve User Roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
     return status, content, header
Example #4
 def get_index_storage_stats(self, index_node=None, timeout=120):
     if index_node is None:
         index_node = self.index_node
     api = self.get_index_baseURL() + 'stats/storage'
     self.log.info("api is:" + str(api))
     content = None
     counter = 0
     while content is None:
         rest_client = RestConnection(index_node)
         status, content, header = rest_client._http_request(
             api, timeout=timeout)
         if not status:
             raise Exception(content)
         counter += 1
         if counter > 10:
             break
     json_parsed = json.loads(content)
     index_storage_stats = {}
     for index_stats in json_parsed:
         bucket = index_stats["Index"].split(":")[0]
         index_name = index_stats["Index"].split(":")[-1]
         if bucket not in index_storage_stats:
             index_storage_stats[bucket] = {}
         index_storage_stats[bucket][index_name] = index_stats["Stats"]
     return index_storage_stats
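The nested map returned above is keyed first by bucket, then by index name, with the raw "Stats" dict as the value. A hypothetical consumer (names are placeholders) might walk it like this:

storage_stats = self.get_index_storage_stats(timeout=60)
for bucket, indexes in storage_stats.items():
    for index_name, index_stats in indexes.items():
        # index_stats is the raw "Stats" dict from the stats/storage endpoint
        self.log.info("{0}/{1}: {2}".format(bucket, index_name, index_stats))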
Example #5
 def test_simple_ui_request(self):
     rest = RestConnection(self.master)
     passed = True
     for api in ["", "versions", "pools", "pools/default", "pools/nodes", "pools/default/overviewStats",
                 "pools/default/buckets", "pools/default/buckets/@query/stats", "pools/default/nodeServices",
                 "pools/default/remoteClusters", "pools/default/serverGroups", "pools/default/certificate",
                 "pools/default/settings/memcached/global", "nodeStatuses", "logs", "settings/web",
                 "settings/alerts",
                 "settings/stats", "settings/autoFailover", "settings/maxParallelIndexers",
                 "settings/viewUpdateDaemon",
                 "settings/autoCompaction", "settings/replications", "settings/replications",
                 "settings/saslauthdAuth", "settings/audit", "internalSettings", "nodes/self/xdcrSSLPorts",
                 "indexStatus",
                 #"diag/vbuckets", MB-15080
                 "settings/indexes", "diag",
                 #"diag/ale", MB-15080
                 "pools/default/rebalanceProgress", "pools/default/tasks", "index.html", "sasl_logs",
                 "couchBase", "sampleBuckets"]:
         url = rest.baseUrl + api
         self.log.info("GET " + url)
         try:
             status, content, header = rest._http_request(url)
         except IncompleteRead as e:
             self.log.warning("size of partial response for {0} api is {1} bytes".format(api, sys.getsizeof(e.partial)))
             if api != "diag":
                 #otherwise for /diag API we should increase request time for dynamic data in _http_request
                 passed = False
             continue
         self.log.info(header)
         if not self.is_linux and api == "settings/saslauthdAuth":
             #This http API endpoint is only supported in enterprise edition running on GNU/Linux
             continue
         if not status:
             self.log.info("wrong status for {0} GET request {1}".format(url, header))
             passed = False
Example #6
 def is_backup_service_running(self):
     """ Returns true if the backup service is running.
     """
     rest = RestConnection(self.server)
     content = rest._http_request(rest.baseUrl + "pools/default/nodeServices")[1]
     services = json.loads(content)['nodesExt'][0]['services']
     return 'backupAPI' in services
Example #7
 def _retrieve_user_roles(self):
     rest = RestConnection(self.master_ip)
     url = "/settings/rbac/users"
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'GET')
     log.info(" Retrieve User Roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
     return status, content, header
Example #8
 def upload_client_cert_settings(self, server=None):
     """
     Upload client cert settings(that was initialized in init function) to CB server
     """
     if server is None:
         server = self.host
     with open(x509main.CACERTFILEPATH + x509main.CLIENT_CERT_AUTH_JSON,
               'rb') as fh:
         data = fh.read()
     self.log.info("Client cert to be Uploaded -- {0}".format(data))
     rest = RestConnection(server)
     authorization = base64.b64encode(
         ('%s:%s' % (rest.username, rest.password)).encode()).decode()
     headers = {'Content-Type': 'application/octet-stream',
                'Authorization': 'Basic %s' % authorization,
                'Accept': '*/*'}
     url = "settings/clientCertAuth"
     api = rest.baseUrl + url
     tries = 0
     while tries < 4:
         status, content, response = rest._http_request(
             api, method='POST', params=data, headers=headers, timeout=300)
         if status:
             return content
         else:
             tries = tries + 1
             if tries >= 4:
                 raise Exception(content)
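The hand-rolled four-attempt loop above recurs in several of these helpers. A sketch of the same pattern factored into a standalone function; this helper is an assumption for illustration, not part of the testrunner API:

def post_with_retries(rest, api, data, headers, attempts=4, timeout=300):
    """ POST to api, retrying up to `attempts` times; raise with the last error body. """
    last_content = None
    for _ in range(attempts):
        status, content, _ = rest._http_request(
            api, method='POST', params=data, headers=headers, timeout=timeout)
        if status:
            return content
        last_content = content
    raise Exception(last_content)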
Example #9
class NsServerIngress(UserResourceTask):
    """ Produces throughput for ingress_mib_per_min """

    def __init__(self, user, node):
        super(NsServerIngress, self).__init__(user, node)
        self.rest = RestConnection(node)
        self.workers = []
        self.threads = 10

        for _ in range(self.threads):
            self.workers.append(IngressThroughputWorker(self.node))

        for worker in self.workers:
            worker.start()

    def on_throughput_update(self, throughput):
        for worker in self.workers:
            worker.throughput.set(throughput / self.threads)

        if throughput == 0:
            for worker in self.workers:
                worker.stop()

    on_throughput_increase = on_throughput_update
    on_throughput_decrease = on_throughput_update

    def get_throughput_success(self):
        return sum(worker.throughput_success.get() for worker in self.workers)

    def error(self):
        return self.rest._http_request(self.rest.baseUrl + "/pools/default")[1]

    def expected_error(self):
        return 'Limit(s) exceeded [ingress]'
Example #10
    def test_simple_ui_request(self):
        rest = RestConnection(self.master)
        passed = True
        for api in ["", "versions", "pools", "pools/default", "pools/nodes", "pools/default/overviewStats",
                    "pools/default/buckets", "pools/default/buckets/@query/stats", "pools/default/nodeServices",
                    "pools/default/remoteClusters", "pools/default/serverGroups", "pools/default/certificate",
                    "pools/default/settings/memcached/global", "nodeStatuses", "logs", "settings/web",
                    "settings/alerts",
                    "settings/stats", "settings/autoFailover", "settings/maxParallelIndexers",
                    "settings/viewUpdateDaemon",
                    "settings/autoCompaction", "settings/replications", "settings/replications",
                    "settings/saslauthdAuth", "settings/audit", "internalSettings", "nodes/self/xdcrSSLPorts",
                    "indexStatus",
                    #"diag/vbuckets", MB-15080
                    "settings/indexes", "diag",
                    #"diag/ale", MB-15080
                    "pools/default/rebalanceProgress", "pools/default/tasks", "index.html", "sasl_logs",
                    "couchBase", "sampleBuckets"]:
            url = rest.baseUrl + api
            self.log.info("GET " + url)
            try:
                status, content, header = rest._http_request(url)
            except IncompleteRead as e:
                self.log.warning("size of partial responce {0} api is {1} bytes".format(api, sys.getsizeof(e.partial)))
                if api != "diag":
                    #otherwise for /diag API we should increase request time for dynamic data in _http_request
                    passed = False
                continue
            self.log.info(header)
            if not self.is_linux and api == "settings/saslauthdAuth":
                #This http API endpoint is only supported in enterprise edition running on GNU/Linux
                continue
            if not status:
                self.log.info("wrong status for {0} GET request {1}".format(url, header))
                passed = False
        self.assertTrue(passed, msg="some GET requests failed. See logs above")

        _, content, _ = rest._http_request(rest.baseUrl + "sasl_logs")
        occurrences = [m.start() for m in re.finditer('web request failed', str(content))]
        for occurrence in occurrences:
            subcontent = content[occurrence - 1000: occurrence + 1000]
            if 'path,"/diag"' in str(subcontent):
                break
            else:
                passed = False
                self.log.info(subcontent)
            self.assertTrue(passed, "some web request failed in the server logs. See logs above")
Example #11
 def _retrive_all_user_role(self, user_list=None):
     server = self.master_ip
     rest = RestConnection(server)
     url = "/settings/rbac/roles"
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'GET')
     log.info(" Retrieve all User roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
     return status, content, header
Example #12
 def _retrive_all_user_role(self, user_list=None):
     server = self.master_ip
     rest = RestConnection(server)
     url = "/settings/rbac/roles"
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'GET')
     log.info(" Retrieve all User roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
     return status, content, header
Example #13
 def _delete_user(self, user_name):
     rest = RestConnection(self.master_ip)
     url = "/settings/rbac/users/" + user_name
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'DELETE')
     print(status)
     print(content)
     print(header)
     return status, content, header
Example #14
 def _retrieve_user_roles(self):
     rest = RestConnection(self.master_ip)
     url = "/settings/rbac/users"
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'GET')
     print(status)
     print(content)
     print(header)
     return status, content, header
Example #15
 def _delete_user(self, user_name):
     rest = RestConnection(self.master_ip)
     url = "/settings/rbac/users/" + user_name
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'DELETE')
     print(status)
     print(content)
     print(header)
     return status, content, header
Example #16
 def enable_ldap(self):
     rest = RestConnection(self.master)
     api = rest.baseUrl + 'settings/saslauthdAuth'
     params = urllib.parse.urlencode({
         "enabled": 'true',
         "admins": [],
         "roAdmins": []
     })
     status, content, header = rest._http_request(api, 'POST', params)
Example #17
 def _retrieve_user_roles(self):
     rest = RestConnection(self.master_ip)
     url = "/settings/rbac/users"
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'GET')
     print(status)
     print(content)
     print(header)
     return status, content, header
Example #18
 def set_metadata_purge_interval(self, interval=0.04):
     # defaults to 0.04 days, i.e. roughly 1 hour, if not given
     rest = RestConnection(self.cluster.master)
     params = {}
     api = rest.baseUrl + "controller/setAutoCompaction"
     params["purgeInterval"] = interval
     params["parallelDBAndViewCompaction"] = "false"
     params = urllib.parse.urlencode(params)
     return rest._http_request(api, "POST", params)
Example #19
 def set_index_settings(self, setting_json, index_node):
     plasma_obj = PlasmaStatsUtil(index_node, server_task=self.task)
     api = plasma_obj.get_index_baseURL() + 'settings'
     rest_client = RestConnection(index_node)
     status, content, header = rest_client._http_request(
         api, 'POST', json.dumps(setting_json))
     if not status:
         raise Exception(content)
     self.log.info("{0} set".format(setting_json))
Example #20
 def _set_user_roles(self, user_name, payload):
     rest = RestConnection(self.master_ip)
     if self.auth_type == "ldap" or self.auth_type == "pam":
         url = "settings/rbac/users/" + user_name
     elif self.auth_type == 'builtin':
         url = "settings/rbac/users/local/" + user_name
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'PUT', params=payload)
     log.info(" Set User Roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
     return status, content, header
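The payload these PUT calls expect is a urlencoded name/roles string, matching the format built in check_roles_base_access further down. A made-up usage example (user name and role are placeholders):

payload = "name=testuser&roles=ro_admin"
status, content, header = self._set_user_roles("testuser", payload)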
Example #21
 def _delete_user(self, user_name):
     rest = RestConnection(self.master_ip)
     if self.auth_type == 'ldap' or self.auth_type == "pam":
         url = "/settings/rbac/users/" + user_name
     else:
         url = "settings/rbac/users/local/" + user_name
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'DELETE')
     log.info(" Deleting User - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
     return status, content, header
Example #22
 def _delete_user(self, user_name):
     rest = RestConnection(self.master_ip)
     if self.auth_type == 'ldap' or self.auth_type == "pam":
         url = "/settings/rbac/users/external/" + user_name
     else:
         url = "settings/rbac/users/local/" + user_name
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'DELETE')
     log.info(" Deleting User - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
     return status, content, header
Example #23
 def _retrive_all_user_role(self, user_list=None):
     server = self.master_ip
     rest = RestConnection(server)
     url = "/settings/rbac/roles"
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'GET')
     print(status)
     print(content)
     print(header)
     return status, content, header
Example #24
 def _retrive_all_user_role(self, user_list=None):
     server = self.master_ip
     rest = RestConnection(server)
     url = "/settings/rbac/roles"
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'GET')
     print(status)
     print(content)
     print(header)
     return status, content, header
Example #25
 def _set_user_roles(self, user_name, payload):
     rest = RestConnection(self.master_ip)
     if self.auth_type == "ldap" or self.auth_type == "pam":
         url = "settings/rbac/users/external/" + user_name
     elif self.auth_type == 'builtin':
         url = "settings/rbac/users/local/" + user_name
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'PUT', params=payload)
     log.info(" Set User Roles - Status - {0} -- Content - {1} -- Header - {2}".format(status, content, header))
     return status, content, header
Example #26
 def _reload_node_certificate(self, host):
     rest = RestConnection(host)
     api = rest.baseUrl + "node/controller/reloadCertificate"
     http = httplib2.Http(disable_ssl_certificate_validation=self.
                          disable_ssl_certificate_validation)
     status, content, header = rest._http_request(
         api,
         'POST',
         headers=self._create_rest_headers('Administrator', 'password'))
     return status, content
Example #27
 def _check_user_permission(self, user_name, password, permission_set):
     rest = RestConnection(self.master_ip)
     url = "pools/default/checkPermissions/"
     param = permission_set
     api = rest.baseUrl + url
     authorization = base64.b64encode(
         ('%s:%s' % (user_name, password)).encode()).decode()
     header = {'Content-Type': 'application/x-www-form-urlencoded',
               'Authorization': 'Basic %s' % authorization,
               'Accept': '*/*'}
     status, content, header = rest._http_request(api, 'POST', params=param, headers=header)
     return status, content, header
Example #28
 def _set_user_roles(self, user_name, payload):
     rest = RestConnection(self.master_ip)
     url = "settings/rbac/users/" + user_name
     #param = urllib.urlencode(payload)
     param = payload
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'PUT', param)
     print(status)
     print(content)
     print(header)
     return status, content, header
Example #29
 def _set_user_roles(self, user_name, payload):
     rest = RestConnection(self.master_ip)
     url = "settings/rbac/users/" + user_name
     #param = urllib.urlencode(payload)
     param = payload
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, 'PUT', param)
     print(status)
     print(content)
     print(header)
     return status, content, header
Example #30
    def test_simple_ui_request(self):
        rest = RestConnection(self.master)
        passed = True
        self.log.info("GET " + rest.query_baseUrl)
        status, content, header = rest._http_request(rest.query_baseUrl)
        self.log.info(header)
        if not status:
            self.log.info("wrong status for {0} GET request {1}".format(rest.query_baseUrl, header))
            passed = False
        self.assertTrue(passed, msg="some GET requests failed. See logs above")

        _, content, _ = rest._http_request(rest.query_baseUrl)
        occurrences = [m.start() for m in re.finditer('web request failed', content)]
        for occurrence in occurrences:
            subcontent = content[occurrence - 1000: occurrence + 1000]
            if 'path,"/diag"' in subcontent:
                break
            else:
                passed = False
                self.log.info(subcontent)
            self.assertTrue(passed, "some web request failed in the server logs. See logs above")
Example #31
 def _check_user_permission(self, user_name, password, permission_set):
     rest = RestConnection(self.master_ip)
     url = "pools/default/checkPermissions/"
     param = permission_set
     api = rest.baseUrl + url
     authorization = base64.b64encode(
         ('%s:%s' % (user_name, password)).encode()).decode()
     header = {'Content-Type': 'application/x-www-form-urlencoded',
               'Authorization': 'Basic %s' % authorization,
               'Accept': '*/*'}
     time.sleep(10)
     status, content, header = rest._http_request(api, 'POST', params=param, headers=header)
     return status, content, header
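The fixed time.sleep(10) above only papers over propagation delay. A hedged alternative is to poll with a deadline; this wrapper is a sketch for illustration, not existing testrunner code:

import time

def wait_for_permission_check(check, retries=10, delay=1):
    """ Call check() until it returns a truthy status or retries run out. """
    for _ in range(retries):
        status, content, header = check()
        if status:
            break
        time.sleep(delay)
    return status, content, header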
Example #32
 def _rest_client_wrapper(self, username, password, url, method, params, restClient, port=None):
     if restClient is None:
         restClient = self.master_ip
     if port is not None:
         restClient.port = port
     rest = RestConnection(restClient)
     rest.username = username
     rest.password = password
     api = rest.baseUrl + url
     status, content, header = rest._http_request(api, method=method, params=params)
     print(content)
     print(status)
     return header['status']
Example #33
 def get_all_index_stat_map(self, index_node=None, timeout=120):
     if index_node is None:
         index_node = self.index_node
     rest_client = RestConnection(index_node)
     api = self.get_index_baseURL() + 'stats'
     status, content, header = rest_client._http_request(api,
                                                         timeout=timeout)
     if status:
         json_parsed = json.loads(content)
         index_map = self.get_bucket_index_stats(json_parsed)
         index_stat_map = self.get_indexer_stats(json_parsed)
         index_stat_map['bucket_index_map'] = index_map
         return index_stat_map
Example #34
    def test_non_ssl_ports_after_enabling_tls(self):
        """
        1. Enforce TLS on cluster
        2. For each component make a GET request on non-ssl port,
        and validate that it fails.
        3. Make the same above request on TLS port and validate that it works
        4. Repeat for all components
        5. Disable n2n encryption on all nodes
        6. For each component make a GET request on non-ssl port,
        and validate that it works
        """
        self.enable_tls_encryption_cli_on_nodes(nodes=[self.cluster.master])
        CbServer.use_https = True
        rest = RestConnection(self.cluster.master)
        for non_ssl_request in self.sample_urls_map.keys():
            api = non_ssl_request % self.cluster.master.ip
            try:
                rest._http_request(api=api, timeout=10)
            except Exception as _:
                ssl_request = self.sample_urls_map[non_ssl_request]
                api = ssl_request % self.cluster.master.ip
                status, content, response = rest._http_request(api=api,
                                                               timeout=10)
                if not status:
                    self.fail("{0} failed".format(api))
            else:
                self.log.error("{0} worked".format(api))

        self.disable_n2n_encryption_cli_on_nodes(nodes=[self.cluster.master])
        CbServer.use_https = False
        rest = RestConnection(self.cluster.master)
        for non_ssl_request in self.sample_urls_map.keys():
            api = non_ssl_request % self.cluster.master.ip
            status, content, response = rest._http_request(api=api, timeout=10)
            if not status:
                self.fail("{0} api failed with content {1}".format(
                    api, content))
Example #35
class NsServerEgress(UserResourceTask):
    """ Produces throughput for egress_mib_per_min """
    def __init__(self, user, node):
        super(NsServerEgress, self).__init__(user, node)
        self.rest = RestConnection(node)
        self.workers = []
        self.threads = 10

        for _ in range(self.threads):
            self.workers.append(EgressThroughputWorker(self.node))

        for worker in self.workers:
            worker.start()

    def on_throughput_update(self, throughput):
        """ Updates document size """
        document_size = throughput / (self.threads * self.workers[0].chunks)
        self.rest.set_document("default", "doc24601",
                               create_document_of_size(document_size - 127))

        for worker in self.workers:
            worker.throughput.set(throughput / self.threads)

        if throughput == 0:
            for worker in self.workers:
                worker.stop()

    on_throughput_increase = on_throughput_update
    on_throughput_decrease = on_throughput_update

    def get_throughput_success(self):
        return sum(worker.throughput_success.get() for worker in self.workers)

    def error(self):
        return self.rest._http_request(self.rest.baseUrl + "/pools/default")[1]

    def expected_error(self):
        return 'Limit(s) exceeded [egress]'
Example #36
 def build_info(node):
     rest = RestConnection(node)
     api = rest.baseUrl + 'nodes/self'
     status, content, header = rest._http_request(api)
     json_parsed = json.loads(content)
     return json_parsed
Example #37
 def rotate_data_key(self, host):
     rest = RestConnection(host)
     api = rest.baseUrl + "/node/controller/rotateDataKey"
     status, content, header = rest._http_request(api, 'POST')
     log.info("Status of rotate data key command - {0}".format(status))
     return status
Example #38
class CommunityTests(CommunityBaseTest):
    def setUp(self):
        super(CommunityTests, self).setUp()
        self.command = self.input.param("command", "")
        self.zone = self.input.param("zone", 1)
        self.replica = self.input.param("replica", 1)
        self.command_options = self.input.param("command_options", '')
        self.set_get_ratio = self.input.param("set_get_ratio", 0.9)
        self.item_size = self.input.param("item_size", 128)
        self.shutdown_zone = self.input.param("shutdown_zone", 1)
        self.do_verify = self.input.param("do-verify", True)
        self.num_node = self.input.param("num_node", 4)
        self.services = self.input.param("services", None)
        self.start_node_services = self.input.param("start_node_services",
                                                    "kv")
        self.add_node_services = self.input.param("add_node_services", "kv")
        self.timeout = 6000
        self.user_add = self.input.param("user_add", None)
        self.user_role = self.input.param("user_role", None)

    def tearDown(self):
        super(CommunityTests, self).tearDown()

    def test_disabled_zone(self):
        disabled_zone = False
        zone_name = "group1"
        serverInfo = self.servers[0]
        self.rest = RestConnection(serverInfo)
        try:
            self.log.info("create zone name 'group1'!")
            result = self.rest.add_zone(zone_name)
            print("result  ", result)
        except Exception as e:
            if e:
                print(e)
                disabled_zone = True
                pass
        if not disabled_zone:
            self.fail("CE version should not have zone feature")

    def check_audit_available(self):
        audit_available = False
        try:
            self.rest.getAuditSettings()
            audit_available = True
        except Exception as e:
            if e:
                print(e)
        if audit_available:
            self.fail("This feature 'audit' only available on "
                      "Enterprise Edition")

    def check_ldap_available(self):
        ldap_available = False
        self.rest = RestConnection(self.master)
        try:
            s, c, h = self.rest.clearLDAPSettings()
            if s:
                ldap_available = True
        except Exception as e:
            if e:
                print(e)
        if ldap_available:
            self.fail("This feature 'ldap' only available on "
                      "Enterprise Edition")

    def check_set_services(self):
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        try:
            status = self.rest.init_node_services(hostname=self.master.ip,
                                                  services=[self.services])
        except Exception as e:
            if e:
                print(e)
        if self.services == "kv":
            if status:
                self.log.info("CE could set {0} only service.".format(
                    self.services))
            else:
                self.fail("Failed to set {0} only service.".format(
                    self.services))
        elif self.services == "index,kv":
            if status:
                self.fail("CE does not support kv and index on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "kv,n1ql":
            if status:
                self.fail("CE does not support kv and n1ql on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "kv,eventing":
            if status:
                self.fail("CE does not support kv and eventing on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "index,n1ql":
            if status:
                self.fail("CE does not support index and n1ql on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "index,kv,n1ql":
            if status:
                self.log.info(
                    "CE could set all services {0} on same nodes.".format(
                        self.services))
            else:
                self.fail("Failed to set kv, index and query services on CE")
        elif self.version[:5] in COUCHBASE_FROM_WATSON:
            if self.version[:5] in COUCHBASE_FROM_VULCAN and "eventing" in self.services:
                if status:
                    self.fail("CE does not support eventing in vulcan")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,index,kv":
                if status:
                    self.fail(
                        "CE does not support fts, index and kv on same node")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,index,n1ql":
                if status:
                    self.fail(
                        "CE does not support fts, index and n1ql on same node")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,kv,n1ql":
                if status:
                    self.fail(
                        "CE does not support fts, kv and n1ql on same node")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,index,kv,n1ql":
                if status:
                    self.log.info(
                        "CE could set all services {0} on same nodes.".format(
                            self.services))
                else:
                    self.fail("Failed to set "
                              "fts, index, kv, and query services on CE")
        else:
            self.fail("some services don't support")

    def check_set_services_when_add_node(self):
        self.rest.force_eject_node()
        sherlock_services_in_ce = ["kv", "index,kv,n1ql"]
        watson_services_in_ce = ["kv", "index,kv,n1ql", "fts,index,kv,n1ql"]
        self.sleep(5, "wait for node reset done")
        kv_quota = 0
        while kv_quota == 0:
            time.sleep(1)
            kv_quota = int(self.rest.get_nodes_self().mcdMemoryReserved)
        info = self.rest.get_nodes_self()
        kv_quota = int(info.mcdMemoryReserved * (CLUSTER_QUOTA_RATIO))
        self.rest.set_service_memoryQuota(service='indexMemoryQuota',
                                          memoryQuota=INDEX_QUOTA)
        self.rest.set_service_memoryQuota(service='ftsMemoryQuota',
                                          memoryQuota=FTS_QUOTA)
        self.rest.init_cluster_memoryQuota(
            self.input.membase_settings.rest_username,
            self.input.membase_settings.rest_password,
            kv_quota - INDEX_QUOTA - FTS_QUOTA - 100)
        try:
            self.log.info("Initialize node with services {0}".format(
                self.start_node_services))
            status = self.rest.init_node_services(
                hostname=self.master.ip, services=[self.start_node_services])
            self.rest.init_cluster()
        except Exception as e:
            if e:
                print(e)
        if not status:
            if self.version not in COUCHBASE_FROM_WATSON and \
                         self.start_node_services not in sherlock_services_in_ce:
                self.log.info(
                    "initial services setting enforced in Sherlock CE")
            elif self.version in COUCHBASE_FROM_WATSON and \
                         self.start_node_services not in watson_services_in_ce:
                self.log.info("initial services setting enforced in Watson CE")

        elif status:
            add_node = False
            try:
                self.log.info("node with services {0} try to add".format(
                    self.add_node_services))
                add_node = self.cluster.rebalance(
                    self.servers[:2],
                    self.servers[1:2], [],
                    services=[self.add_node_services])
            except Exception:
                pass
            if add_node:
                self.get_services_map()
                list_nodes = self.get_nodes_from_services_map(
                    get_all_nodes=True)
                map = self.get_nodes_services()
                if map[self.master.ip] == self.start_node_services and \
                    map[self.servers[1].ip] == self.add_node_services:
                    self.log.info(
                        "services set correctly when node added & rebalance")
                else:
                    self.fail("services set incorrectly when node added & rebalance. "
                        "cluster expected services: {0}; set cluster services {1} ."
                        "add node expected srv: {2}; set add node srv {3}"\
                        .format(map[self.master.ip], self.start_node_services, \
                         map[self.servers[1].ip], self.add_node_services))
            else:
                if self.version not in COUCHBASE_FROM_WATSON:
                    if self.start_node_services in ["kv", "index,kv,n1ql"] and \
                          self.add_node_services not in ["kv", "index,kv,n1ql"]:
                        self.log.info("services are enforced in CE")
                    elif self.start_node_services not in [
                            "kv", "index,kv,n1ql"
                    ]:
                        self.log.info("services are enforced in CE")
                    else:
                        self.fail("maybe bug in add node")
                elif self.version in COUCHBASE_FROM_WATSON:
                    if self.start_node_services in ["kv", "index,kv,n1ql",
                         "fts,index,kv,n1ql"] and self.add_node_services not in \
                                    ["kv", "index,kv,n1ql", "fts,index,kv,n1ql"]:
                        self.log.info("services are enforced in CE")
                    elif self.start_node_services not in [
                            "kv", "index,kv,n1ql", "fts,index,kv,n1ql"
                    ]:
                        self.log.info("services are enforced in CE")
                    else:
                        self.fail("maybe bug in add node")
        else:
            self.fail("maybe bug in node initialization")

    def check_full_backup_only(self):
        """ for windows vm, ask IT to put uniq.exe at
            /cygdrive/c/Program Files (x86)/ICW/bin directory """

        self.remote = RemoteMachineShellConnection(self.master)
        """ put params items=0 in test param so that init items = 0 """
        self.remote.execute_command("{0}cbworkloadgen -n {1}:8091 -j -i 1000 " \
                                    "-u Administrator -p password" \
                                            .format(self.bin_path, self.master.ip))
        """ delete backup location before run backup """
        self.remote.execute_command("rm -rf {0}*".format(self.backup_location))
        output, error = self.remote.execute_command("ls -lh {0}".format(
            self.backup_location))
        self.remote.log_command_output(output, error)
        """ first full backup """
        self.remote.execute_command("{0}cbbackup http://{1}:8091 {2} -m full " \
                                    "-u Administrator -p password"\
                                    .format(self.bin_path,
                                            self.master.ip,
                                            self.backup_c_location))
        output, error = self.remote.execute_command("ls -lh {0}*/".format(
            self.backup_location))
        self.remote.log_command_output(output, error)
        output, error = self.remote.execute_command("{0}cbtransfer -u Administrator "\
                                           "-p password {1}*/*-full/ " \
                                           "stdout: | grep set | uniq | wc -l"\
                                           .format(self.bin_path,
                                                   self.backup_c_location))
        self.remote.log_command_output(output, error)
        if int(output[0]) != 1000:
            self.fail("full backup did not work in CE. "
                      "Expected 1000, actual: {0}".format(output[0]))
        self.remote.execute_command("{0}cbworkloadgen -n {1}:8091 -j -i 1000 "\
                                    " -u Administrator -p password --prefix=t_"
                                    .format(self.bin_path, self.master.ip))
        """ do different backup mode """
        self.remote.execute_command("{0}cbbackup -u Administrator -p password "\
                                    "http://{1}:8091 {2} -m {3}"\
                                    .format(self.bin_path,
                                            self.master.ip,
                                            self.backup_c_location,
                                            self.backup_option))
        output, error = self.remote.execute_command("ls -lh {0}".format(
            self.backup_location))
        self.remote.log_command_output(output, error)
        output, error = self.remote.execute_command("{0}cbtransfer -u Administrator "\
                                           "-p password {1}*/*-{2}/ stdout: "\
                                           "| grep set | uniq | wc -l"\
                                           .format(self.bin_path,
                                                   self.backup_c_location,
                                                   self.backup_option))
        self.remote.log_command_output(output, error)
        if int(output[0]) == 2000:
            self.log.info("backup option 'diff' is enforced in CE")
        elif int(output[0]) == 1000:
            self.fail("backup option 'diff' is not enforced in CE. "
                      "Expected 2000, actual: {0}".format(output[0]))
        else:
            self.fail("backup failed to backup correct items")
        self.remote.disconnect()

    def check_ent_backup(self):
        """ for CE version from Watson, cbbackupmgr exe file should not in bin """
        command = "cbbackupmgr"
        self.remote = RemoteMachineShellConnection(self.master)
        self.log.info("check if {0} in {1} directory".format(
            command, self.bin_path))
        found = self.remote.file_exists(self.bin_path, command)
        if found:
            self.log.info("found {0} in {1} directory".format(
                command, self.bin_path))
            self.log.info("Ent. backup in CE is in bin!")
        elif not found:
            self.fail(
                "CE from Cheshire Cat should contain {0}".format(command))
        self.remote.disconnect()

    def check_memory_optimized_storage_mode(self):
        """ from Watson, CE should not have option 'memory_optimized' to set """
        self.rest.force_eject_node()
        self.sleep(5, "wait for node reset done")
        try:
            self.log.info("Initialize node with 'Memory Optimized' option")
            status = self.rest.set_indexer_storage_mode(
                username=self.input.membase_settings.rest_username,
                password=self.input.membase_settings.rest_password,
                storageMode='memory_optimized')
        except Exception as ex:
            if ex:
                print(ex)
        if not status:
            self.log.info("Memory Optimized setting enforced in CE: "
                          "could not set the memory_optimized option")
        else:
            self.fail("Memory Optimized setting is not enforced in CE: "
                      "we could set this option")

    def check_plasma_storage_mode(self):
        """ from Watson, CE should not have option 'plasma' to set """
        self.rest.force_eject_node()
        self.sleep(5, "wait for node reset done")
        try:
            self.log.info("Initialize node with 'Plasma' option")
            status = self.rest.set_indexer_storage_mode(
                username=self.input.membase_settings.rest_username,
                password=self.input.membase_settings.rest_password,
                storageMode='plasma')
        except Exception as ex:
            if ex:
                print(ex)
        if not status:
            self.log.info("Plasma setting enforced in CE: "
                          "could not set the plasma option")
        else:
            self.fail("Plasma setting is not enforced in CE: "
                      "we could set this option")

    def check_x509_cert(self):
        """ from Watson, X509 certificate only support in EE """
        api = self.rest.baseUrl + "pools/default/certificate?extended=true"
        self.log.info("request to get certificate at "
                      "'pools/default/certificate?extended=true' "
                      "should return False")
        try:
            status, content, header = self.rest._http_request(api, 'GET')
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("This X509 certificate feature only available in EE")
        elif not status:
            if b'requires enterprise edition' in content:
                self.log.info("X509 cert is enforced in CE")

    def check_roles_base_access(self):
        """ from Watson, roles base access for admin should not in in CE """
        if self.user_add is None:
            self.fail(
                "We need to pass user name (user_add) to run this test. ")
        if self.user_role is None:
            self.fail(
                "We need to pass user roles (user_role) to run this test. ")
        api = self.rest.baseUrl + "settings/rbac/users/" + self.user_add
        self.log.info("url to run this test: %s" % api)
        """ add admin user """
        param = "name=%s&roles=%s" % (self.user_add, self.user_role)
        try:
            status, content, header = self.rest._http_request(
                api, 'PUT', param)
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not allow to add admin users")
        else:
            self.log.info("roles base is enforced in CE! ")

    def check_root_certificate(self):
        """ from watson, ce should not see root certificate
            manual test:
            curl -u Administrator:password -X GET
                            http://localhost:8091/pools/default/certificate """
        api = self.rest.baseUrl + "pools/default/certificate"
        try:
            status, content, header = self.rest._http_request(api, 'GET')
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not see root certificate!")
        elif b'requires enterprise edition' in content:
            self.log.info("root certificate is enforced in CE! ")

    def check_settings_audit(self):
        """ from watson, ce should not set audit
            manual test:
            curl -u Administrator:password -X GET
                            http://localhost:8091/settings/audit """
        api = self.rest.baseUrl + "settings/audit"
        try:
            status, content, header = self.rest._http_request(api, 'GET')
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not allow to set audit !")
        elif b'requires enterprise edition' in content:
            self.log.info("settings audit is enforced in CE! ")

    def check_infer(self):
        """ from watson, ce should not see infer
            manual test:
            curl -H "Content-Type: application/json" -X POST
                 -d '{"statement":"infer `bucket_name`;"}'
                       http://localhost:8093/query/service
            test params: new_services=kv-index-n1ql,default_bucket=False """
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        self.rest.init_node()
        bucket = "default"
        self.rest.create_bucket(bucket, ramQuotaMB=200)
        api = self.rest.query_baseUrl + "query/service"
        param = urllib.parse.urlencode({"statement": "infer `%s` ;" % bucket})
        try:
            status, content, header = self.rest._http_request(
                api, 'POST', param)
            json_parsed = json.loads(content)
        except Exception as ex:
            if ex:
                print(ex)
        if json_parsed["status"] == "success":
            self.fail("CE should not allow to run INFER !")
        elif json_parsed["status"] == "fatal":
            self.log.info("INFER is enforced in CE! ")

    def check_query_monitoring(self):
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        self.rest.init_node()
        bucket = "default"
        self.rest.create_bucket(bucket, ramQuotaMB=200)
        api = self.rest.query_baseUrl + "admin/settings"
        param = {'profile': 'phases'}
        try:
            status, content, header = self.rest._http_request(
                api, 'POST', json.dumps(param))
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not be allowed to do query monitoring !")
        elif b'Profiling is an EE only feature' in content:
            self.log.info("Query monitoring is enforced in CE! ")

    def check_flex_index(self):
        """ from watson, ce should not see infer
            manual test:
            curl -H "Content-Type: application/json" -X POST
                 -d '{"statement":"infer `bucket_name`;"}'
                       http://localhost:8093/query/service
            test params: new_services=kv-index-n1ql,default_bucket=False """
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        self.rest.init_node()
        bucket = "default"
        self.rest.create_bucket(bucket, ramQuotaMB=200)
        api = self.rest.query_baseUrl + "query/service"
        param = urllib.parse.urlencode({
            "statement":
            "SELECT META(d).id FROM `%s` AS d USE INDEX (USING FTS) WHERE d.f2 = 100;"
            % bucket
        })
        try:
            status, content, header = self.rest._http_request(
                api, 'POST', param)
            json_parsed = json.loads(content)
        except Exception as ex:
            if ex:
                print(ex)
        if json_parsed["status"] == "success":
            self.fail("CE should not allow to run flex index !")
        elif json_parsed["status"] == "fatal":
            self.log.info("Flex index is enforced in CE! ")

    def check_index_partitioning(self):
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        self.rest.init_node()
        bucket = "default"
        self.rest.create_bucket(bucket, ramQuotaMB=200)
        api = self.rest.query_baseUrl + "query/service"
        param = urllib.parse.urlencode({
            "statement":
            "CREATE INDEX idx ON `%s`(id) PARTITION BY HASH(META().id)" %
            bucket
        })
        try:
            status, content, header = self.rest._http_request(
                api, 'POST', param)
            json_parsed = json.loads(content)
        except Exception as ex:
            if ex:
                print(ex)
        if json_parsed["status"] == "success":
            self.fail("CE should not be allowed to run index partitioning !")
        elif json_parsed["status"] == "fatal":
            self.log.info("Index partitioning is enforced in CE! ")

    def check_query_cost_based_optimizer(self):
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        self.rest.init_node()
        bucket = "default"
        self.rest.create_bucket(bucket, ramQuotaMB=200)
        api = self.rest.query_baseUrl + "query/service"
        param = urllib.parse.urlencode({
            "statement":
            "UPDATE STATISTICS for `hotel` (type, address, city, country, free_breakfast, id, phone);"
        })
        try:
            status, content, header = self.rest._http_request(
                api, 'POST', param)
            json_parsed = json.loads(content)
        except Exception as ex:
            if ex:
                print(ex)
        if json_parsed["status"] == "success":
            self.fail("CE should not be allowed to run CBO !")
        elif json_parsed["status"] == "fatal":
            self.log.info("CBO is enforced in CE! ")

    def check_query_window_functions(self):
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        self.rest.init_node()
        bucket = "default"
        self.rest.create_bucket(bucket, ramQuotaMB=200)
        api = self.rest.query_baseUrl + "query/service"
        param = urllib.parse.urlencode({
            "statement":
            "SELECT d.id, d.destinationairport, CUME_DIST() OVER (PARTITION BY d.destinationairport \
                            ORDER BY d.distance NULLS LAST) AS `rank` \
                            FROM `%s` AS d \
                            WHERE d.type='route' \
                            LIMIT 7;" % bucket
        })
        try:
            status, content, header = self.rest._http_request(
                api, 'POST', param)
            json_parsed = json.loads(content)
        except Exception as ex:
            if ex:
                print(ex)
        if json_parsed["status"] == "success":
            self.fail("CE should not be allowed to use window functions !")
        elif json_parsed["status"] == "fatal":
            self.log.info("Window functions is enforced in CE! ")

    def check_auto_complete(self):
        """ this feature has not complete to block in CE """

    """ Check new features from spock start here """

    def check_cbbackupmgr(self):
        """ cbbackupmgr should not available in CE from spock """
        if self.cb_version[:5] in COUCHBASE_FROM_SPOCK:
            file_name = "cbbackupmgr" + self.file_extension
            self.log.info("check if cbbackupmgr in bin dir in CE")
            result = self.remote.file_exists(self.bin_path, file_name)
            if result:
                self.fail("cbbackupmgr should not in bin dir of CE")
            else:
                self.log.info("cbbackupmgr is enforced in CE")
        self.remote.disconnect()

    def test_max_ttl_bucket(self):
        """
            From vulcan, EE buckets have an option to set --max-ttl; CE does not.
            This test makes sure CE cannot create a bucket with the --max-ttl option.
            This test must pass default_bucket=False
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        cmd = 'curl -X POST -u Administrator:password \
                                    http://{0}:8091/pools/default/buckets \
                                 -d name=bucket0 \
                                 -d maxTTL=100 \
                                 -d ramQuotaMB=100 '.format(self.master.ip)
        if self.cli_test:
            cmd = "{0}couchbase-cli bucket-create -c {1}:8091 --username Administrator \
                --password password --bucket bucket0 --bucket-type couchbase \
                --bucket-ramsize 512 --bucket-replica 1 --bucket-priority high \
                --bucket-eviction-policy fullEviction --enable-flush 0 \
                --enable-index-replica 1 --max-ttl 200".format(
                self.bin_path, self.master.ip)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Max TTL is supported in enterprise edition only"
        if self.cli_test:
            mesg = "Maximum TTL can only be configured on enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("max ttl feature should not in Community Edition")
        buckets = RestConnection(self.master).get_buckets()
        if buckets:
            for bucket in buckets:
                self.log.info("bucekt in cluser: {0}".format(bucket.name))
                if bucket.name == "bucket0":
                    self.fail("Failed to enforce feature max ttl in CE.")
        conn.disconnect()

    def test_setting_audit(self):
        """
           CE does not allow setting audit from vulcan 5.5.0
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        cmd = 'curl -X POST -u Administrator:password \
              http://{0}:8091/settings/audit \
              -d auditdEnabled=true '.format(self.master.ip)
        if self.cli_test:
            cmd = "{0}couchbase-cli setting-audit -c {1}:8091 -u Administrator \
                -p password --audit-enabled 1 --audit-log-rotate-interval 604800 \
                --audit-log-path /opt/couchbase/var/lib/couchbase/logs --set"\
                .format(self.bin_path, self.master.ip)

        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "This http API endpoint requires enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("setting-audit feature should not in Community Edition")
        conn.disconnect()

    def test_setting_autofailover_enterprise_only(self):
        """
           CE does not allow setting auto failover on data disk issues
           or failover of server groups, from vulcan 5.5.0
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        self.failover_disk_period = self.input.param("failover_disk_period",
                                                     False)
        self.failover_server_group = self.input.param("failover_server_group",
                                                      False)

        failover_disk_period = ""
        if self.failover_disk_period:
            if self.cli_test:
                failover_disk_period = "--failover-data-disk-period 300"
            else:
                failover_disk_period = "-d failoverOnDataDiskIssues[timePeriod]=300"
        failover_server_group = ""
        if self.failover_server_group and self.cli_test:
            failover_server_group = "--enable-failover-of-server-group 1"

        cmd = 'curl -X POST -u Administrator:password \
              http://{0}:8091/settings/autoFailover -d enabled=true -d timeout=120 \
              -d maxCount=1 \
              -d failoverOnDataDiskIssues[enabled]=true {1} \
              -d failoverServerGroup={2}'.format(self.master.ip,
                                                 failover_disk_period,
                                                 self.failover_server_group)
        if self.cli_test:
            cmd = "{0}couchbase-cli setting-autofailover -c {1}:8091 \
                   -u Administrator -p password \
                   --enable-failover-on-data-disk-issues 1 {2} {3} "\
                  .format(self.bin_path, self.master.ip,
                          failover_disk_period,
                          failover_server_group)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Auto failover on Data Service disk issues can only be " + \
               "configured on enterprise edition"
        if not self.cli_test:
            if self.failover_disk_period or \
                                   self.failover_server_group:
                if output and not error:
                    self.fail("setting autofailover disk issues feature\
                               should not in Community Edition")
        else:
            if self.failover_server_group:
                mesg = "--enable-failover-of-server-groups can only be " + \
                       "configured on enterprise edition"

        if output and mesg not in str(output[0]):
            self.fail("Setting EE autofailover features \
                       should not in Community Edition")
        else:
            self.log.info("EE setting autofailover are disable in CE")
        conn.disconnect()

    def test_set_bucket_compression(self):
        """
           CE does not allow setting bucket compression on a bucket
           from vulcan 5.5.0. Mode compression: off,active,passive
           Note: must set defaultbucket=False for this test
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        self.compression_mode = self.input.param("compression_mode", "off")
        cmd = 'curl -X POST -u Administrator:password \
                                    http://{0}:8091/pools/default/buckets \
                                 -d name=bucket0 \
                                 -d compressionMode={1} \
                                 -d ramQuotaMB=100 '.format(
            self.master.ip, self.compression_mode)
        if self.cli_test:
            cmd = "{0}couchbase-cli bucket-create -c {1}:8091 --username Administrator \
                --password password --bucket bucket0 --bucket-type couchbase \
                --bucket-ramsize 512 --bucket-replica 1 --bucket-priority high \
                --bucket-eviction-policy fullEviction --enable-flush 0 \
                --enable-index-replica 1 --compression-mode {2}".format(
                self.bin_path, self.master.ip, self.compression_mode)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Compression mode is supported in enterprise edition only"
        if self.cli_test:
            mesg = "Compression mode can only be configured on enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("Setting bucket compression should not in CE")
        conn.disconnect()

    def test_ldap_groups(self):
        """
           LDAP Groups feature is not available in CE
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_MAD_HATTER:
            self.log.info("This test is only for MH and later")
            return
        cmd = 'curl -X POST -u Administrator:password \
                                    http://{0}:8091/settings/rbac/groups/admins \
                                 -d roles=admin \
                                 -d description="Couchbase+Server+Administrators" \
                                 --data-urlencode ldap_group_ref="uid=cbadmins,ou=groups,dc=example,dc=com"'\
                                .format(self.master.ip)
        if self.cli_test:
            cmd = '{0}couchbase-cli user-manage -c {1}:8091 --username Administrator \
                --password password  \
                --set-group \
                --group-name admins \
                --roles admin \
                --group-description "Couchbase Server Administrators" \
                --ldap-ref "uid=cbadmins,ou=groups,dc=example,dc=com"'.format(
                self.bin_path, self.master.ip)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Requested resource not found."
        if self.cli_test:
            mesg = "ERROR: This http API endpoint requires enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("LDAP Groups should not be in CE")
        conn.disconnect()

    def test_ldap_cert(self):
        """
           LDAP Cert feature is not available in CE
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_MAD_HATTER:
            self.log.info("This test is only for MH and later")
            return
        cmd = 'curl -X POST -u Administrator:password http://{0}:8091/settings/ldap \
                                 -d hosts={1} \
                                 -d port=389 \
                                 -d encryption=StartTLSExtension \
                                 -d serverCertValidation=true \
                                 --data-urlencode [email protected] \
                                 -d bindDN="cn=admin,dc=example,dc=com" \
                                 -d bindPass=password \
                                 -d authenticationEnabled=true \
                                 -d authorizationEnabled=true \
                                 --data-urlencode groupsQuery="ou=groups,dc=example,dc=com??one?(member=%D)"'\
                                .format(self.master.ip, self.master.ip)
        if self.cli_test:
            cmd = '{0}couchbase-cli setting-ldap -c {1}:8091 --username Administrator \
                --password password  \
                --authentication-enabled 1 \
                --authorization-enabled 1 \
                --hosts {2} \
                --encryption startTLS \
                --client-cert root.crt \
                --bind-dn "cn=admin,dc=example,dc=com" \
                --bind-password password \
                --group-query "ou=groups,dc=example,dc=com??one?(member=%D)"'.format(
                self.bin_path, self.master.ip, self.master.ip)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "This http API endpoint requires enterprise edition"
        if self.cli_test:
            mesg = "ERROR: Command only available in enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("LDAP Cert should not be in CE")
        conn.disconnect()

    def test_network_encryption(self):
        """
           Encrypted network access is not available in CE
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_MAD_HATTER:
            self.log.info("This test is only for MH and later")
            return
        cmd = 'curl  -u Administrator:password -v -X POST \
                    http://{0}:8091/settings/security \
                    -d disableUIOverHttp=true \
                    -d clusterEncryptionLevel=control \
                    -d tlsMinVersion=tlsv1.1 \
                    -d "cipherSuites=["TLS_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA"]"'\
                                .format(self.master.ip)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "not supported in community edition"
        if output and mesg not in str(output[0]):
            self.fail("Encrypted network access should not be in CE")
        conn.disconnect()

    def test_n2n_encryption(self):
        """
           Encrypted network access is not available in CE
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_MAD_HATTER:
            self.log.info("This test is only for MH and later")
            return
        cmd = '/opt/couchbase/bin/couchbase-cli node-to-node-encryption \
                -c http://{0}:8091 \
                -u Administrator \
                -p password \
                --enable'\
                .format(self.master.ip)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "not supported in community edition"
        if output and mesg not in str(output[0]):
            self.fail("Encrypted network access should not be in CE")
        conn.disconnect()

    def test_log_redaction(self):
        """
            Log redaction feature is not available in CE
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_MAD_HATTER:
            self.log.info("This test is only for MH and later")
            return
        cmd = 'curl -X POST -u Administrator:password \
                                            http://{0}:8091/controller/startLogsCollection \
                                         -d nodes="*" \
                                         -d logRedactionLevel=partial'.format(
            self.master.ip)
        if self.cli_test:
            cmd = '{0}couchbase-cli collect-logs-start -c {1}:8091 --username Administrator \
                        --password password  \
                        --all-nodes \
                        --redaction-level partial'.format(
                self.bin_path, self.master.ip)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "log redaction is an enterprise only feature"
        if output and mesg not in str(output[0]):
            self.fail("Log redaction should not be in CE")
        conn.disconnect()
Example #39
class FailoverTests(FailoverBaseTest):
    def setUp(self):
        super(FailoverTests, self).setUp()

    def tearDown(self):
        super(FailoverTests, self).tearDown()

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case (before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARDFAILOVER/GRACEFUL)
            4.1 Rebalance the cluster after failover of K nodes
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats, replication, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info(" Picking node {0} as reference node for test case".format(self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
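        # NOTE: lexicographic string comparison; any node version string sorting below "3" marks the cluster as pre-3.0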
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType is not None)):
            self.log.error("Graceful failover can't be applied to nodes with version less than 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=1, target_node = self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform View Creation Tasks and check for completion if required before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Take a snapshot of the data set used for validation
        record_static_data_set = {}
        prev_vbucket_stats = {}
        prev_failover_stats = {}
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path = None)

        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets)

        # Perform operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # Perform Add Back Operation with Rebalance Or only Rebalance with Verifications
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
        else:
            return

        if self.during_ops == None:
            self.verify_unacked_bytes_all_buckets(filter_list = self.filter_list, master_node = self.master)

    def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """ Method to run rebalance after failover and verify """
        # Need a delay of more than a minute because of MB-7168
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining = True)
        self.sleep(5, "after failover before invoking rebalance...")
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.master.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.master)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.master)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master,bucket)
        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers,chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node = chosen[0])
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[node.id for node in chosen])

        # Rebalance Monitoring
        msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Reset password or port
        if self.during_ops:
            if self.during_ops == "change_password":
                self.change_password(new_password=old_pass)
            elif self.during_ops == "change_port":
                self.change_port(new_port='8091',
                current_port=self.input.param("new_port", "9090"))
            return

        #  Drain Queue and make sure intra-cluster replication is complete
        self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
        self.verify_cluster_stats(_servers_, self.master, check_bucket_stats = True, check_ep_items_remaining = True)
        # Verify the complete data set with metadata when no mutation ops ran during failover
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path = None, addedItems = None)

        # Check Cluster Stats and Data as well if max_verify > 0
        # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
        # Currently only checked for the graceful failover case
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets)
            new_vbucket_stats =  self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers = nodes, buckets = self.buckets, std = 20.0 , total_vbuckets = self.total_vbuckets)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """
            Method to run add-back operation with recovery type = (delta/full)
            It also verifies if the operations are correct with data verification steps
        """
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining = True)
        serverMap =  self.get_server_map(self.servers)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets, serverMap)
        index = 0
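        # Add each failed-over node back, tagging it with the requested recovery type (full/delta)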
        for node in chosen:
            self.rest.add_back_node(node.id)
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[],deltaRecoveryBuckets = self.deltaRecoveryBuckets)

        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master,bucket)

        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers,chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node = chosen[0])
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],ejectedNodes=[],deltaRecoveryBuckets = self.deltaRecoveryBuckets)

        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        #  Drain ep_queue and make sure that intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining = True)

        self.log.info("Begin VERIFICATION for Add-back and rebalance")

        # Verify Stats of cluster and Data if max_verify > 0
        self.verify_cluster_stats(self.servers, self.master, check_bucket_stats = True, check_ep_items_remaining = True)

        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, serverMap, self.buckets,recoveryTypeMap, fileMapsForVerification, self.deltaRecoveryBuckets)

        # Comparison of all data if required
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set,self.servers, self.buckets,  path = None, addedItems = None)

        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version  > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets,perNode= False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats,self.servers,self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers = nodes, buckets = self.buckets, std = 20.0 , total_vbuckets = self.total_vbuckets)

        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable=True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable=True
                self.filter_list.append (node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count, self.num_replicas, unreachable)
            # Trigger failover; request graceful only when permitted by the verified failover type
            success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
                    msg = "graceful failover failed for nodes {0}".format(node.id)
                    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
                else:
                    msg = "rebalance failed while removing failover nodes {0}".format(node.id)
                    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(failed_over,""" Graceful failover should fail due to not enough replicas """)
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in  60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed")

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers,chosen)
            self.vb_distribution_analysis(servers = nodes, buckets = self.buckets, std = 20.0 , total_vbuckets = self.total_vbuckets, type = "failover", graceful = (self.graceful and graceful_failover) )

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable=True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable=True
                self.filter_list.append (node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
        nodes = self.filter_servers(self.servers,chosen)
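        # Kick off failover asynchronously so the compaction/view/mutation ops below run in parallel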
        failed_over = self.cluster.async_failover([self.master], failover_nodes = chosen, graceful=self.graceful)
        # Perform Compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(self.cluster.async_compact_bucket(self.master,bucket))
        # Run View Operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
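        # Block until the async failover task completes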
        failed_over.result()
        for task in compact_tasks:
            task.result()
        msg = "rebalance failed while removing failover nodes {0}".format(node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)


    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.master, self.gen_initial_create, "create", 0, flag = 2, batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining = True)
        self._verify_stats_all_buckets(self.servers,timeout = 120)

    def run_mutation_operations(self):
        mutation_ops_tasks = []
        if("create" in self.doc_ops):
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_create, "create", 0)
        if("update" in self.doc_ops):
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_update, "update", 0)
        if("delete" in self.doc_ops):
            mutation_ops_tasks += self._async_load_all_buckets(self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)
Example #40
def build_info(node):
    """ Fetch node build/version info from the 'nodes/self' REST endpoint """
    rest = RestConnection(node)
    api = rest.baseUrl + 'nodes/self'
    status, content, header = rest._http_request(api)
    json_parsed = json.loads(content)
    return json_parsed
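
# A minimal usage sketch (hedged: assumes a server object carrying REST
# credentials, like self.master in the surrounding tests):
#     info = build_info(master_node)
#     print("node version: {0}".format(info.get("version")))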
Example #41
def _get_cluster_ca_cert(self):
    rest = RestConnection(self.host)
    api = rest.baseUrl + "pools/default/certificate?extended=true"
    status, content, header = rest._http_request(api, 'GET')
    return status, content, header
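
# A minimal usage sketch (hedged: on CE this extended-certificate endpoint is
# expected to fail; compare check_x509_cert in the next example):
#     status, content, _ = self._get_cluster_ca_cert()
#     assert not status and "requires enterprise edition" in str(content)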
Example #42
class CommunityTests(CommunityBaseTest):
    def setUp(self):
        super(CommunityTests, self).setUp()
        self.command = self.input.param("command", "")
        self.zone = self.input.param("zone", 1)
        self.replica = self.input.param("replica", 1)
        self.command_options = self.input.param("command_options", '')
        self.set_get_ratio = self.input.param("set_get_ratio", 0.9)
        self.item_size = self.input.param("item_size", 128)
        self.shutdown_zone = self.input.param("shutdown_zone", 1)
        self.do_verify = self.input.param("do-verify", True)
        self.num_node = self.input.param("num_node", 4)
        self.services = self.input.param("services", None)
        self.start_node_services = self.input.param("start_node_services",
                                                    "kv")
        self.add_node_services = self.input.param("add_node_services", "kv")
        self.timeout = 6000
        self.user_add = self.input.param("user_add", None)
        self.user_role = self.input.param("user_role", None)

    def tearDown(self):
        super(CommunityTests, self).tearDown()

    def test_disabled_zone(self):
        disabled_zone = False
        zone_name = "group1"
        serverInfo = self.servers[0]
        self.rest = RestConnection(serverInfo)
        try:
            self.log.info("create zone name 'group1'!")
            result = self.rest.add_zone(zone_name)
            print("result  ", result)
        except Exception as e:
            if e:
                print(e)
                disabled_zone = True
                pass
        if not disabled_zone:
            self.fail("CE version should not have zone feature")

    def check_audit_available(self):
        audit_available = False
        try:
            self.rest.getAuditSettings()
            audit_available = True
        except Exception as e:
            if e:
                print(e)
        if audit_available:
            self.fail("This feature 'audit' only available on "
                      "Enterprise Edition")

    def check_ldap_available(self):
        ldap_available = False
        self.rest = RestConnection(self.master)
        try:
            s, c, h = self.rest.clearLDAPSettings()
            if s:
                ldap_available = True
        except Exception as e:
            if e:
                print(e)
        if ldap_available:
            self.fail("This feature 'ldap' only available on "
                      "Enterprise Edition")

    def check_set_services(self):
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        try:
            status = self.rest.init_node_services(hostname=self.master.ip,
                                                  services=[self.services])
        except Exception as e:
            if e:
                print(e)
        if self.services == "kv":
            if status:
                self.log.info("CE could set {0} only service.".format(
                    self.services))
            else:
                self.fail("Failed to set {0} only service.".format(
                    self.services))
        elif self.services == "index,kv":
            if status:
                self.fail("CE does not support kv and index on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "kv,n1ql":
            if status:
                self.fail("CE does not support kv and n1ql on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "kv,eventing":
            if status:
                self.fail("CE does not support kv and eventing on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "index,n1ql":
            if status:
                self.fail("CE does not support index and n1ql on same node")
            else:
                self.log.info("services enforced in CE")
        elif self.services == "index,kv,n1ql":
            if status:
                self.log.info(
                    "CE could set all services {0} on same nodes.".format(
                        self.services))
            else:
                self.fail("Failed to set kv, index and query services on CE")
        elif self.version[:5] in COUCHBASE_FROM_WATSON:
            if self.version[:5] in COUCHBASE_FROM_VULCAN and "eventing" in self.services:
                if status:
                    self.fail("CE does not support eventing in vulcan")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,index,kv":
                if status:
                    self.fail(
                        "CE does not support fts, index and kv on same node")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,index,n1ql":
                if status:
                    self.fail(
                        "CE does not support fts, index and n1ql on same node")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,kv,n1ql":
                if status:
                    self.fail(
                        "CE does not support fts, kv and n1ql on same node")
                else:
                    self.log.info("services enforced in CE")
            elif self.services == "fts,index,kv,n1ql":
                if status:
                    self.log.info(
                        "CE could set all services {0} on same nodes.".format(
                            self.services))
                else:
                    self.fail("Failed to set "
                              "fts, index, kv, and query services on CE")
        else:
            self.fail("some services don't support")

    def check_set_services_when_add_node(self):
        self.rest.force_eject_node()
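        # Service combinations CE permits at cluster init, by release train (Sherlock vs Watson)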
        sherlock_services_in_ce = ["kv", "index,kv,n1ql"]
        watson_services_in_ce = ["kv", "index,kv,n1ql", "fts,index,kv,n1ql"]
        self.sleep(5, "wait for node reset done")
        try:
            self.log.info("Initialize node with services {0}".format(
                self.start_node_services))
            status = self.rest.init_node_services(
                hostname=self.master.ip, services=[self.start_node_services])
            init_node = self.cluster.async_init_node(
                self.master, services=[self.start_node_services])
        except Exception as e:
            if e:
                print(e)
        if not status:
            if self.version not in COUCHBASE_FROM_WATSON and \
                         self.start_node_services not in sherlock_services_in_ce:
                self.log.info(
                    "initial services setting enforced in Sherlock CE")
            elif self.version in COUCHBASE_FROM_WATSON and \
                         self.start_node_services not in watson_services_in_ce:
                self.log.info("initial services setting enforced in Watson CE")

        elif status and init_node.result() != 0:
            add_node = False
            try:
                self.log.info("node with services {0} try to add".format(
                    self.add_node_services))
                add_node = self.cluster.rebalance(
                    self.servers[:2],
                    self.servers[1:2], [],
                    services=[self.add_node_services])
            except Exception:
                pass
            if add_node:
                self.get_services_map()
                list_nodes = self.get_nodes_from_services_map(
                    get_all_nodes=True)
                map = self.get_nodes_services()
                if map[self.master.ip] == self.start_node_services and \
                    map[self.servers[1].ip] == self.add_node_services:
                    self.log.info(
                        "services set correctly when node added & rebalance")
                else:
                    self.fail("services set incorrectly when node added & rebalance. "
                        "cluster expected services: {0}; set cluster services {1} ."
                        "add node expected srv: {2}; set add node srv {3}"\
                        .format(map[self.master.ip], self.start_node_services, \
                         map[self.servers[1].ip], self.add_node_services))
            else:
                if self.version not in COUCHBASE_FROM_WATSON:
                    if self.start_node_services in ["kv", "index,kv,n1ql"] and \
                          self.add_node_services not in ["kv", "index,kv,n1ql"]:
                        self.log.info("services are enforced in CE")
                    elif self.start_node_services not in [
                            "kv", "index,kv,n1ql"
                    ]:
                        self.log.info("services are enforced in CE")
                    else:
                        self.fail("maybe bug in add node")
                elif self.version in COUCHBASE_FROM_WATSON:
                    if self.start_node_services in ["kv", "index,kv,n1ql",
                         "fts,index,kv,n1ql"] and self.add_node_services not in \
                                    ["kv", "index,kv,n1ql", "fts,index,kv,n1ql"]:
                        self.log.info("services are enforced in CE")
                    elif self.start_node_services not in [
                            "kv", "index,kv,n1ql", "fts,index,kv,n1ql"
                    ]:
                        self.log.info("services are enforced in CE")
                    else:
                        self.fail("maybe bug in add node")
        else:
            self.fail("maybe bug in node initialization")

    def check_full_backup_only(self):
        """ for windows vm, ask IT to put uniq.exe at
            /cygdrive/c/Program Files (x86)/ICW/bin directory """

        self.remote = RemoteMachineShellConnection(self.master)
        """ put params items=0 in test param so that init items = 0 """
        self.remote.execute_command("{0}cbworkloadgen -n {1}:8091 -j -i 1000 " \
                                    "-u Administrator -p password" \
                                            .format(self.bin_path, self.master.ip))
        """ delete backup location before run backup """
        self.remote.execute_command("rm -rf {0}*".format(self.backup_location))
        output, error = self.remote.execute_command("ls -lh {0}".format(
            self.backup_location))
        self.remote.log_command_output(output, error)
        """ first full backup """
        self.remote.execute_command("{0}cbbackup http://{1}:8091 {2} -m full " \
                                    "-u Administrator -p password"\
                                    .format(self.bin_path,
                                            self.master.ip,
                                            self.backup_c_location))
        output, error = self.remote.execute_command("ls -lh {0}*/".format(
            self.backup_location))
        self.remote.log_command_output(output, error)
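        # Count backed-up items by piping the backup through cbtransfer and counting unique "set" ops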
        output, error = self.remote.execute_command("{0}cbtransfer -u Administrator "\
                                           "-p password {1}*/*-full/ " \
                                           "stdout: | grep set | uniq | wc -l"\
                                           .format(self.bin_path,
                                                   self.backup_c_location))
        self.remote.log_command_output(output, error)
        if int(output[0]) != 1000:
            self.fail("full backup did not work in CE. "
                      "Expected 1000, actual: {0}".format(output[0]))
        self.remote.execute_command("{0}cbworkloadgen -n {1}:8091 -j -i 1000 "\
                                    " -u Administrator -p password --prefix=t_"
                                    .format(self.bin_path, self.master.ip))
        """ do different backup mode """
        self.remote.execute_command("{0}cbbackup -u Administrator -p password "\
                                    "http://{1}:8091 {2} -m {3}"\
                                    .format(self.bin_path,
                                            self.master.ip,
                                            self.backup_c_location,
                                            self.backup_option))
        output, error = self.remote.execute_command("ls -lh {0}".format(
            self.backup_location))
        self.remote.log_command_output(output, error)
        output, error = self.remote.execute_command("{0}cbtransfer -u Administrator "\
                                           "-p password {1}*/*-{2}/ stdout: "\
                                           "| grep set | uniq | wc -l"\
                                           .format(self.bin_path,
                                                   self.backup_c_location,
                                                   self.backup_option))
        self.remote.log_command_output(output, error)
        if int(output[0]) == 2000:
            self.log.info("backup option 'diff' is enforced in CE")
        elif int(output[0]) == 1000:
            self.fail("backup option 'diff' is not enforced in CE. "
                      "Expected 2000, actual: {0}".format(output[0]))
        else:
            self.fail("backup failed to backup correct items")
        self.remote.disconnect()

    def check_ent_backup(self):
        """ for CE version from Watson, cbbackupmgr exe file should not in bin """
        command = "cbbackupmgr"
        self.remote = RemoteMachineShellConnection(self.master)
        self.log.info("check if {0} in {1} directory".format(
            command, self.bin_path))
        found = self.remote.file_exists(self.bin_path, command)
        if found:
            self.log.info("found {0} in {1} directory".format(
                command, self.bin_path))
            self.fail("CE from Watson should not contain {0}".format(command))
        elif not found:
            self.log.info("Ent. backup in CE is enforced, not in bin!")
        self.remote.disconnect()

    def check_memory_optimized_storage_mode(self):
        """ from Watson, CE should not have option 'memory_optimized' to set """
        self.rest.force_eject_node()
        self.sleep(5, "wait for node reset done")
        try:
            self.log.info("Initialize node with 'Memory Optimized' option")
            status = self.rest.set_indexer_storage_mode(
                username=self.input.membase_settings.rest_username,
                password=self.input.membase_settings.rest_password,
                storageMode='memory_optimized')
        except Exception as ex:
            if ex:
                print(ex)
        if not status:
            self.log.info("Memory Optimized setting enforced in CE "
                          "Could not set memory_optimized option")
        else:
            self.fail("Memory Optimzed setting does not enforced in CE "
                      "We could set this option in")

    def check_x509_cert(self):
        """ from Watson, X509 certificate only support in EE """
        api = self.rest.baseUrl + "pools/default/certificate?extended=true"
        self.log.info("request to get certificate at "
                      "'pools/default/certificate?extended=true' "
                      "should return False")
        try:
            status, content, header = self.rest._http_request(api, 'GET')
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("This X509 certificate feature only available in EE")
        elif not status:
            if "requires enterprise edition" in content:
                self.log.info("X509 cert is enforced in CE")

    def check_roles_base_access(self):
        """ from Watson, roles base access for admin should not in in CE """
        if self.user_add is None:
            self.fail(
                "We need to pass user name (user_add) to run this test. ")
        if self.user_role is None:
            self.fail(
                "We need to pass user roles (user_role) to run this test. ")
        api = self.rest.baseUrl + "settings/rbac/users/" + self.user_add
        self.log.info("url to run this test: %s" % api)
        """ add admin user """
        param = "name=%s&roles=%s" % (self.user_add, self.user_role)
        try:
            status, content, header = self.rest._http_request(
                api, 'PUT', param)
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not allow to add admin users")
        else:
            self.log.info("roles base is enforced in CE! ")

    def check_root_certificate(self):
        """ from watson, ce should not see root certificate
            manual test:
            curl -u Administrator:password -X GET
                            http://localhost:8091/pools/default/certificate """
        api = self.rest.baseUrl + "pools/default/certificate"
        try:
            status, content, header = self.rest._http_request(api, 'GET')
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not see root certificate!")
        elif "requires enterprise edition" in content:
            self.log.info("root certificate is enforced in CE! ")

    def check_settings_audit(self):
        """ from watson, ce should not set audit
            manual test:
            curl -u Administrator:password -X GET
                            http://localhost:8091/settings/audit """
        api = self.rest.baseUrl + "settings/audit"
        try:
            status, content, header = self.rest._http_request(api, 'GET')
        except Exception as ex:
            if ex:
                print(ex)
        if status:
            self.fail("CE should not allow to set audit !")
        elif "requires enterprise edition" in content:
            self.log.info("settings audit is enforced in CE! ")

    def check_infer(self):
        """ from watson, ce should not see infer
            manual test:
            curl -H "Content-Type: application/json" -X POST
                 -d '{"statement":"infer `bucket_name`;"}'
                       http://localhost:8093/query/service
            test params: new_services=kv-index-n1ql,default_bucket=False """
        self.rest.force_eject_node()
        self.sleep(7, "wait for node reset done")
        self.rest.init_node()
        bucket = "default"
        self.rest.create_bucket(bucket, ramQuotaMB=200)
        api = self.rest.query_baseUrl + "query/service"
        param = urllib.parse.urlencode({"statement": "infer `%s` ;" % bucket})
        try:
            status, content, header = self.rest._http_request(
                api, 'POST', param)
            json_parsed = json.loads(content)
        except Exception as ex:
            if ex:
                print(ex)
        if json_parsed["status"] == "success":
            self.fail("CE should not allow to run INFER !")
        elif json_parsed["status"] == "fatal":
            self.log.info("INFER is enforced in CE! ")

    def check_auto_complete(self):
        """ this feature has not complete to block in CE """

    """ Check new features from spock start here """

    def check_cbbackupmgr(self):
        """ cbbackupmgr should not available in CE from spock """
        if self.cb_version[:5] in COUCHBASE_FROM_SPOCK:
            file_name = "cbbackupmgr" + self.file_extension
            self.log.info("check if cbbackupmgr in bin dir in CE")
            result = self.remote.file_exists(self.bin_path, file_name)
            if result:
                self.fail("cbbackupmgr should not in bin dir of CE")
            else:
                self.log.info("cbbackupmgr is enforced in CE")
        self.remote.disconnect()

    def test_max_ttl_bucket(self):
        """
            From vulcan, an EE bucket has an option to set --max-ttl, but CE does not.
            This test makes sure CE cannot create a bucket with option --max-ttl.
            This test must pass default_bucket=False
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        cmd = 'curl -X POST -u Administrator:password \
                                    http://{0}:8091/pools/default/buckets \
                                 -d name=bucket0 \
                                 -d maxTTL=100 \
                                 -d authType=sasl \
                                 -d ramQuotaMB=100 '.format(self.master.ip)
        if self.cli_test:
            cmd = "{0}couchbase-cli bucket-create -c {1}:8091 --username Administrator \
                --password password --bucket bucket0 --bucket-type couchbase \
                --bucket-ramsize 512 --bucket-replica 1 --bucket-priority high \
                --bucket-eviction-policy fullEviction --enable-flush 0 \
                --enable-index-replica 1 --max-ttl 200".format(
                self.bin_path, self.master.ip)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Max TTL is supported in enterprise edition only"
        if self.cli_test:
            mesg = "Maximum TTL can only be configured on enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("max ttl feature should not in Community Edition")
        buckets = RestConnection(self.master).get_buckets()
        if buckets:
            for bucket in buckets:
                self.log.info("bucekt in cluser: {0}".format(bucket.name))
                if bucket.name == "bucket0":
                    self.fail("Failed to enforce feature max ttl in CE.")
        conn.disconnect()

    def test_setting_audit(self):
        """
           CE does not allow setting audit from vulcan 5.5.0
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        cmd = 'curl -X POST -u Administrator:password \
              http://{0}:8091/settings/audit \
              -d auditdEnabled=true '.format(self.master.ip)
        if self.cli_test:
            cmd = "{0}couchbase-cli setting-audit -c {1}:8091 -u Administrator \
                -p password --audit-enabled 1 --audit-log-rotate-interval 604800 \
                --audit-log-path /opt/couchbase/var/lib/couchbase/logs "\
                .format(self.bin_path, self.master.ip)

        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "This http API endpoint requires enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("setting-audit feature should not in Community Edition")
        conn.disconnect()

    def test_setting_autofailover_enterprise_only(self):
        """
           CE does not allow setting auto failover on data disk issues
           or failover of server groups, from vulcan 5.5.0
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        self.failover_disk_period = self.input.param("failover_disk_period",
                                                     False)
        self.failover_server_group = self.input.param("failover_server_group",
                                                      False)

        failover_disk_period = ""
        if self.failover_disk_period:
            if self.cli_test:
                failover_disk_period = "--failover-data-disk-period 300"
            else:
                failover_disk_period = "-d failoverOnDataDiskIssues[timePeriod]=300"
        failover_server_group = ""
        if self.failover_server_group and self.cli_test:
            failover_server_group = "--enable-failover-of-server-group 1"

        cmd = 'curl -X POST -u Administrator:password \
              http://{0}:8091/settings/autoFailover -d enabled=true -d timeout=120 \
              -d maxCount=1 \
              -d failoverOnDataDiskIssues[enabled]=true {1} \
              -d failoverServerGroup={2}'.format(self.master.ip,
                                                 failover_disk_period,
                                                 self.failover_server_group)
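        # Note: failoverServerGroup above is interpolated with the raw
        # True/False test param; CE should reject the EE-only fields either way.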
        if self.cli_test:
            cmd = "{0}couchbase-cli setting-autofailover -c {1}:8091 \
                   -u Administrator -p password \
                   --enable-failover-on-data-disk-issues 1 {2} {3} "\
                  .format(self.bin_path, self.master.ip,
                          failover_disk_period,
                          failover_server_group)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Auto failover on Data Service disk issues can only be " + \
               "configured on enterprise edition"
        if not self.cli_test:
            if self.failover_disk_period or self.failover_server_group:
                if output and not error:
                    self.fail("setting autofailover disk issues feature "
                              "should not be in Community Edition")
        else:
            if self.failover_server_group:
                mesg = "--enable-failover-of-server-groups can only be " + \
                       "configured on enterprise edition"

        if output and mesg not in str(output[0]):
            self.fail("Setting EE autofailover features "
                      "should not be in Community Edition")
        else:
            self.log.info("EE autofailover settings are disabled in CE")
        conn.disconnect()

    def test_set_bucket_compression(self):
        """
           CE does not allow setting bucket compression on a bucket
           from vulcan 5.5.0.  Compression modes: off, active, passive
           Note: default_bucket=False must be set for this test
        """
        if self.cb_version[:5] not in COUCHBASE_FROM_VULCAN:
            self.log.info("This test only for vulcan and later")
            return
        self.compression_mode = self.input.param("compression_mode", "off")
        cmd = 'curl -X POST -u Administrator:password \
                                    http://{0}:8091/pools/default/buckets \
                                 -d name=bucket0 \
                                 -d compressionMode={1} \
                                 -d authType=sasl \
                                 -d ramQuotaMB=100 '.format(
            self.master.ip, self.compression_mode)
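        # compressionMode is EE-only, so CE should reject this POST with the
        # error message checked below.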
        if self.cli_test:
            cmd = "{0}couchbase-cli bucket-create -c {1}:8091 --username Administrator \
                --password password --bucket bucket0 --bucket-type couchbase \
                --bucket-ramsize 512 --bucket-replica 1 --bucket-priority high \
                --bucket-eviction-policy fullEviction --enable-flush 0 \
                --enable-index-replica 1 --compression-mode {2}".format(
                self.bin_path, self.master.ip, self.compression_mode)
        conn = RemoteMachineShellConnection(self.master)
        output, error = conn.execute_command(cmd)
        conn.log_command_output(output, error)
        mesg = "Compression mode is supported in enterprise edition only"
        if self.cli_test:
            mesg = "Compression mode can only be configured on enterprise edition"
        if output and mesg not in str(output[0]):
            self.fail("Setting bucket compression should not in CE")
        conn.disconnect()
Example #43
 def _get_cluster_ca_cert(self):
     rest = RestConnection(self.host)
     api = rest.baseUrl + "pools/default/certificate?extended=true"
     status, content, header = rest._http_request(api, 'GET')
     return status, content, header
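
 # A minimal caller sketch (hypothetical, not part of the original snippet):
 # the helper returns the raw (status, content, header) triple, so nothing is
 # assumed about the certificate payload beyond it being JSON-parseable.
 def _log_cluster_ca_cert(self):
     status, content, header = self._get_cluster_ca_cert()
     if status:
         parsed = json.loads(content)
         self.log.info("cluster CA cert payload keys: {0}".format(list(parsed.keys())))
     else:
         self.log.info("failed to fetch cluster CA cert: {0}".format(content))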
Example #44
class FailoverTests(FailoverBaseTest):
    def setUp(self):
        super(FailoverTests, self).setUp()
        self.server_map = self.get_server_map(self.servers)

    def tearDown(self):
        super(FailoverTests, self).tearDown()

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case(before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
            4.1 Rebalance the cluster after failover of the K nodes
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats,
               replication, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info(
            "Picking node {0} as reference node for test case".format(
                self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False
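        # Note: this is a plain string comparison, so any "2.x" version string
        # sorts before "3" and marks the cluster as pre-3.0.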

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or
                                                  (self.recoveryType != None)):
            self.log.error(
                "Graceful failover can't be applied to nodes with version less than 3.*"
            )
            self.log.error(
                "Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform view creation tasks and wait for completion before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()

        # Take a snapshot of the data set used for validation
        record_static_data_set = dict()
        prev_vbucket_stats = dict()
        prev_failover_stats = dict()
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(self.servers,
                                                           self.buckets,
                                                           path=None)

        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(
                self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(
                self.servers, self.buckets)

        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # TODO: Enable this even when 'flusher_batch_split_trigger' is not set
        if self.flusher_batch_split_trigger and \
                self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(self.master, self.gen_update,
                                                 "update", 0)
            for task in tasks:
                task.result()

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        # Add back + rebalance / only rebalance with verification
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen,
                                                       prev_vbucket_stats,
                                                       record_static_data_set,
                                                       prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats, record_static_data_set,
                    prev_failover_stats)

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        if self.during_ops is None:
            self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list,
                                                  master_node=self.master)

    def run_rebalance_after_failover_and_verify(self, chosen,
                                                prev_vbucket_stats,
                                                record_static_data_set,
                                                prev_failover_stats):
        """ Method to run rebalance after failover and verify """
        # Need a delay > min because MB-7168
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_,
                                         check_ep_items_remaining=True)
        self.sleep(5, "after failover before invoking rebalance...")
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.master.rest_password
                self.change_password(
                    new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.master)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.master)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[node.id for node in chosen])

        # Rebalance Monitoring
        msg = "rebalance failed while removing failover nodes {0}".format(
            [node.id for node in chosen])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Reset password or port
        if self.during_ops:
            if self.during_ops == "change_password":
                self.change_password(new_password=old_pass)
            elif self.during_ops == "change_port":
                self.change_port(new_port='8091',
                                 current_port=self.input.param(
                                     "new_port", "9090"))
            return

        #  Drain Queue and make sure intra-cluster replication is complete
        self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
        self.verify_cluster_stats(_servers_,
                                  self.master,
                                  check_bucket_stats=True,
                                  check_ep_items_remaining=True)
        # Verify the full data set with metadata if no mutations ran in parallel
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set,
                                   _servers_,
                                   self.buckets,
                                   path=None,
                                   addedItems=None)

        # Check Cluster Stats and Data as well if max_verify > 0
        # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
        # Currently, only  for checking case where we  have graceful failover
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.compare_failovers_logs(
                prev_failover_stats, _servers_, self.buckets)
            new_vbucket_stats = self.compare_vbucket_seqnos(
                prev_vbucket_stats, _servers_, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats,
                                                 new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes,
                                          buckets=self.buckets,
                                          std=20.0,
                                          total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats,
                                          record_static_data_set,
                                          prev_failover_stats):
        """
            Method to run add-back operation with recovery type = (delta/full)
            It also verifies if the operations are correct with data verification steps
        """
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_,
                                         check_ep_items_remaining=True)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets,
                                                   self.server_map)
        index = 0
        for node in chosen:
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(
                    otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
            else:
                self.rest.add_back_node(node.id)
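        # Assumption: set_recovery_type on a failed-over node marks it for
        # add-back on the next rebalance, so add_back_node is only called
        # when no recovery type was specified.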

        # Doc_mutation before triggering rebalance
        tasks = self._async_load_all_buckets(self.master, self.gen_update,
                                             "update", 0)
        for task in tasks:
            task.result()

        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[],
                            deltaRecoveryBuckets=self.deltaRecoveryBuckets)

        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)

        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info("Start Rebalance Again!")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[],
                                deltaRecoveryBuckets=self.deltaRecoveryBuckets)

        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(
            chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Drain ep_queue and make sure that intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers,
                                         check_ep_items_remaining=True)

        self.log.info("Begin VERIFICATION for Add-back and rebalance")

        # Verify Stats of cluster and Data is max_verify > 0
        self.verify_cluster_stats(self.servers,
                                  self.master,
                                  check_bucket_stats=True,
                                  check_ep_items_remaining=True)

        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, self.server_map, self.buckets,
                                      recoveryTypeMap, fileMapsForVerification,
                                      self.deltaRecoveryBuckets)

        # Comparison of all data if required
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set,
                                   self.servers,
                                   self.buckets,
                                   path=None,
                                   addedItems=None)

        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version  > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats,
                                                            self.servers,
                                                            self.buckets,
                                                            perNode=False)
            new_failover_stats = self.compare_failovers_logs(
                prev_failover_stats, self.servers, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats,
                                                 new_failover_stats)

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes,
                                          buckets=self.buckets,
                                          std=20.0,
                                          total_vbuckets=self.total_vbuckets)

        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform Operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info(
                    "Waiting for membase-server to shut down")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(self.rest).wait_for_node_status(
                        node, "unhealthy", self.wait_timeout * 10),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info(
                        "node {0}:{1} is 'unhealthy' as expected".format(
                            node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(
                    node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(
                node.id, graceful=(self.graceful and graceful_failover))
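            # Graceful failover is requested only when the test flag is set and
            # the per-node check above (if enabled) did not downgrade it.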
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start Graceful Again
                    self.log.info(" Start Graceful Failover Again !")
                    self.sleep(60)
                    success_failed_over = self.rest.fail_over(
                        node.id,
                        graceful=(self.graceful and graceful_failover))
                    msg = "graceful failover failed for nodes {0}".format(
                        node.id)
                    self.assertTrue(
                        self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
                else:
                    msg = "rebalance failed while removing failover nodes {0}".format(
                        node.id)
                    self.assertTrue(
                        self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(
                failed_over,
                "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(
                failed_over,
                """ Graceful failover should fail due to not enough replicas """
            )
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info(
                "unable to failover the node the first time. try again in 75 seconds.."
            )
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id,
                                              graceful=(self.graceful
                                                        and graceful_failover))
        if self.graceful and (failover_reason
                              not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(
                reached,
                "rebalance failed for Graceful Failover, stuck or did not completed"
            )

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes,
                                          buckets=self.buckets,
                                          std=20.0,
                                          total_vbuckets=self.total_vbuckets,
                                          type="failover",
                                          graceful=(self.graceful
                                                    and graceful_failover))

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform Operations related to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info(
                    "Waiting for membase-server to shut down")
                # wait for 5 minutes until node is down
                self.assertTrue(
                    RestHelper(self.rest).wait_for_node_status(
                        node, "unhealthy", 300),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(
                    server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    self.log.info(
                        "node {0}:{1} is 'unhealthy' as expected".format(
                            node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command(
                                    "netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command(
                                    "/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail(
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )
        nodes = self.filter_servers(self.servers, chosen)
        failed_over = self.cluster.async_failover([self.master],
                                                  failover_nodes=chosen,
                                                  graceful=self.graceful)
        # Perform Compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(
                    self.cluster.async_compact_bucket(self.master, bucket))
        # Run View Operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
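        # Wait for the async failover and any compaction tasks to complete
        # before monitoring the rebalance below.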
        failed_over.result()
        for task in compact_tasks:
            task.result()
        msg = "rebalance failed while removing failover nodes {0}".format(
            node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.master,
                                              self.gen_initial_create,
                                              "create",
                                              0,
                                              flag=2,
                                              batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers,
                                         check_ep_items_remaining=True)
        self._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)
Example #45
class FailoverTests(FailoverBaseTest):
    def setUp(self):
        super(FailoverTests, self).setUp()

    def tearDown(self):
        super(FailoverTests, self).tearDown()

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case (before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
            4.1 Rebalance the cluster after failover of the K nodes
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats, replication, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.referenceNode = self.master
        if self.failoverMaster:
            self.referenceNode = self.servers[1]
        self.log.info(" Picking node {0} as reference node for test case".format(self.referenceNode.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.referenceNode)
        self.nodes = self.rest.node_statuses()

        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType != None)):
            self.log.error("Graceful failover can't be applied to nodes with version less than 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        self.chosen = RebalanceHelper.pick_nodes(self.referenceNode, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withOps = True => Run Operations in parallel to failover
        # self.withOps = False => Run Operations Before failover
        self.ops_tasks = self.run_operation_tasks()

        # Perform View Creation Tasks and check for completion if required before failover
        if self.runViews:
            self.run_view_creation_operations(self.servers)
            if not self.runViewsDuringFailover:
                self.monitor_view_tasks(self.servers)

        # Take a snapshot of the data set used for validation
        record_static_data_set = self.get_data_set_all(self.servers, self.buckets, path=None)
        prev_vbucket_stats = {}
        prev_failover_stats = {}

        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers, self.buckets)

        # Perform Operations related to failover
        self.run_failover_operations(self.chosen, failover_reason)

        # Perform Add Back Operation with Rebalance, or only Rebalance with Verifications
        if not self.gracefulFailoverFail:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(self.chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats)

    def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """ Method to run rebalance after failover and verify """
        # Need a delay > min because MB-7168
        self.sleep(60, "after failover before invoking rebalance...")
        _servers_ = self.filter_servers(self.servers, chosen)
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                               ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.referenceNode.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.referenceNode)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.referenceNode)
        try:
            # Run operations if required during rebalance after failover
            if self.withOps:
                for task in self.ops_tasks:
                    task.result()
            msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen])
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

            #  Drain Queue and make sure intra-cluster replication is complete
            self._verify_stats_all_buckets(_servers_, timeout=120)
            self._wait_for_stats_all_buckets(_servers_)

            self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
            # Verify the full data set with metadata if no mutations ran in parallel
            if not self.withOps:
                self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path=None)
            # Check Cluster Stats and Data as well if max_verify > 0
            self.verify_cluster_stats(_servers_, self.referenceNode)
            # If views were created they can be verified
            if self.runViews:
                if self.runViewsDuringFailover:
                    self.monitor_view_tasks(_servers_)
                self.verify_query_task()
            # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
            # Currently, only  for checking case where we  have graceful failover
            if self.version_greater_than_2_5 and self.graceful and self.upr_check:
                new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets)
                new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets)
                self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
            self.log.info("End VERIFICATION for Rebalance after Failover Only")
        finally:
            if self.during_ops:
                if self.during_ops == "change_password":
                    self.change_password(new_password=old_pass)
                elif self.during_ops == "change_port":
                    self.change_port(new_port='8091',
                                     current_port=self.input.param("new_port", "9090"))

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """
            Method to run add-back operation with recovery type = (delta/full)
            It also verifies if the operations are correct with data verificaiton steps
        """
        serverMap = self.get_server_map(self.servers)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets, serverMap)
        index = 0
        for node in chosen:
            self.rest.add_back_node(node.id)
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                               ejectedNodes=[])
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        # Run operations if required during rebalance after failover
        if self.withOps:
            for task in self.ops_tasks:
                task.result()
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        #  Drain ep_queue and make sure that intra-cluster replication is complete
        self._verify_stats_all_buckets(self.servers, timeout=120)
        self._wait_for_stats_all_buckets(self.servers)

        self.log.info("Begin VERIFICATION for Add-back and rebalance")

        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, serverMap, self.buckets,
                recoveryTypeMap, fileMapsForVerification)

        # Comparison of all data if required
        if not self.withOps:
            self.data_analysis_all(record_static_data_set, self.servers, self.buckets, path=None)

        # Verify Stats of cluster and Data is max_verify > 0
        self.verify_cluster_stats(self.servers, self.referenceNode)

        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version  > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets, perNode=False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, self.servers, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)

        # Perform View Validation if Supported
        if self.runViews:
            if self.runViewsDuringFailover:
                self.monitor_view_tasks(self.servers)
            self.verify_query_task()
        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.referenceNode))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform Operations related to failover
        for node in chosen:
            if failover_reason == 'stop_server':
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

        # define precondition check for failover
        failed_over = self.rest.fail_over(node.id, graceful=self.graceful)
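        # Note: fail_over is invoked outside the loop above, so only the last
        # node in `chosen` is actually failed over here.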

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and failed_over:
            """ Check if the fail_over fails as expected """
            self.assertFalse(failed_over,
                             "Graceful failover should fail due to not enough replicas")
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in  60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=self.graceful)
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for Graceful Failover, stuck or did not completed")

    def run_operation_tasks(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.referenceNode, self.gen_initial_create, "create", 0)
        for task in tasks:
            task.result()
        self._verify_stats_all_buckets(self.servers, timeout=120)
        self._wait_for_stats_all_buckets(self.servers)
        # Update or Delete buckets if items > 0 and options are passed in tests
        # These can run in parallel (withOps = True), or before (withOps = False)
        ops_tasks = []
        if("create" in self.doc_ops):
            ops_tasks += self._async_load_all_buckets(self.referenceNode, self.gen_update, "create", 0)
        if("update" in self.doc_ops):
            ops_tasks += self._async_load_all_buckets(self.referenceNode, self.gen_update, "update", 0)
        if("delete" in self.doc_ops):
            ops_tasks += self._async_load_all_buckets(self.referenceNode, self.gen_delete, "delete", 0)
        if not self.withOps:
            for task in ops_tasks:
                task.result()
            self._wait_for_stats_all_buckets(self.servers)
            self._verify_stats_all_buckets(self.servers, timeout=120)
        return ops_tasks

    def define_maps_during_failover(self, recoveryType=[]):
        """ Method to define the node ip to recovery type map """
        recoveryTypeMap = {}
        index = 0
        for server in self.chosen:
            if recoveryType:
                recoveryTypeMap[server.ip] = recoveryType[index]
            index += 1
        return recoveryTypeMap

    def filter_servers(self, original_servers, filter_servers):
        """ Return the servers that were not failed over """
        _servers_ = copy.deepcopy(original_servers)
        for failed in filter_servers:
            # iterate over a copy so removals do not skip elements
            for server in _servers_[:]:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
        return _servers_

    def verify_for_recovery_type(self, chosen=[], serverMap={}, buckets=[], recoveryTypeMap={}, fileMap={}):
        """ Verify recovery type is delta or full """
        logic = True
        summary = ""
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            for bucket in buckets:
                path = fileMap[server.ip][bucket.name]
                exists = shell.file_exists(path,"check.txt")
                if recoveryTypeMap[server.ip] == "delta" and not exists:
                    logic = False
                    summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip,bucket.name)
                elif recoveryTypeMap[server.ip] == "full" and exists:
                    logic = False
                    summary += "\n Failed Condition :: node {0}, bucket {1}  :: Expected Full, Actual Delta".format(server.ip,bucket.name)
            shell.disconnect()
        self.assertTrue(logic, summary)

    def run_view_creation_operations(self, servers):
        """" Run view Creation and indexing building tasks on servers """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        num_tries = self.input.param("num_tries", 10)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]

        query = {}
        query["connectionTimeout"] = 60000;
        query["full_set"] = "true"

        views = []
        tasks = []
        for bucket in self.buckets:
            temp = self.make_default_views(self.default_view_name, num_views,
                                           is_dev_ddoc, different_map=False)
            temp_tasks = self.async_create_views(self.master, ddoc_name, temp, bucket)
            views += temp
            tasks += temp_tasks

        timeout = max(self.wait_timeout * 4, len(self.buckets) * self.wait_timeout * self.num_items / 50000)

        for task in tasks:
            task.result(self.wait_timeout * 20)

        for bucket in self.buckets:
            for view in views:
                # run queries to create indexes
                self.cluster.query_view(self.master, prefix + ddoc_name, view.name, query)
        self.verify_query_task()
        active_tasks = self.cluster.async_monitor_active_task(servers, "indexer", "_design/" + prefix + ddoc_name, wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def monitor_view_tasks(self, servers):
        """ Monitor Query Tasks for their completion """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]

        active_tasks = self.cluster.async_monitor_active_task(servers, "indexer", "_design/" + prefix + ddoc_name, wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def verify_query_task(self):
        """ Verify Query Results """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000;
        query["full_set"] = "true"
        expected_rows = None
        if self.max_verify:
            expected_rows = self.max_verify
            query["limit"] = expected_rows
        query["stale"] = "false"
        for bucket in self.buckets:
            self.perform_verify_queries(num_views, prefix, ddoc_name, query, bucket=bucket, wait_time=2400, expected_rows=expected_rows)

    def create_file(self, chosen, buckets, serverMap):
        """ Create files in data paths for checking if delta/full recovery occurred """
        fileMap = {}
        for server in self.chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            map = {}
            for bucket in buckets:
                bucket_data_path = self.data_path + "/" + bucket.name + "/" + "check.txt"
                full_path = self.data_path + "/" + bucket.name + "/"
                map[bucket.name] = full_path
                shell.create_file(bucket_data_path, "check")
            fileMap[server.ip] = map
            shell.disconnect()
        return fileMap

    def get_server_map(self, servers):
        """ Map of ips to server information """
        map = {}
        for server in servers:
            map[server.ip] = server
        return map

    def stop_server(self, node):
        """ Method to stop a server which is subject to failover """
        for server in self.servers:
            if server.ip == node.ip:
                shell = RemoteMachineShellConnection(server)
                if shell.is_couchbase_installed():
                    shell.stop_couchbase()
                    self.log.info("Couchbase stopped")
                else:
                    shell.stop_membase()
                    self.log.info("Membase stopped")
                shell.disconnect()
                break
Example #46
class FailoverTests(FailoverBaseTest):
    def setUp(self):
        super(FailoverTests, self).setUp()
        self.server_map = self.get_server_map(self.servers)

    def tearDown(self):
        super(FailoverTests, self).tearDown()

    def test_failover_firewall(self):
        self.common_test_body('firewall')

    def test_failover_normal(self):
        self.common_test_body('normal')

    def test_failover_stop_server(self):
        self.common_test_body('stop_server')

    def test_failover_then_add_back(self):
        self.add_back_flag = True
        self.common_test_body('normal')

    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case(before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
            4.1 Rebalance the cluster after failover of the K nodes
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats,
               replication, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info("Picking node {0} as reference node for test case"
                      .format(self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or (self.recoveryType is not None)):
            self.log.error("Graceful failover can't be applied to nodes with version less than 3.*")
            self.log.error("Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will undergo failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform view creation tasks and wait for completion before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()

        # Take a snapshot of the data set used for validation
        record_static_data_set = dict()
        prev_vbucket_stats = dict()
        prev_failover_stats = dict()
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(
                self.servers, self.buckets, path=None)

        # Capture vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(self.servers,
                                                         self.buckets)
            prev_failover_stats = self.get_failovers_logs(self.servers,
                                                          self.buckets)

        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # TODO: Enable this even when 'flusher_total_batch_limit' is not set
        if self.flusher_total_batch_limit and \
                self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
            for task in tasks:
                task.result()

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        # Add back + rebalance // only rebalance with verification
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(
                    self.chosen, prev_vbucket_stats, record_static_data_set,
                    prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats, record_static_data_set,
                    prev_failover_stats)

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        if self.during_ops is None:
            self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list,
                                                  master_node=self.master)
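
    # A more robust alternative to the lexicographic '"3" > version' check used
    # in common_test_body() above; purely illustrative. Assumes version strings
    # of the form "2.5.1-1083-rel", where the numeric prefix is dot-separated.
    def example_version_at_least(self, version, major):
        numeric = version.split("-")[0]
        try:
            parts = tuple(int(p) for p in numeric.split("."))
        except ValueError:
            return False
        return parts >= (major,)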

    def run_rebalance_after_failover_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """ Method to run rebalance after failover and verify """
        # Need a delay > min because MB-7168
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True)
        self.sleep(5, "after failover before invoking rebalance...")
        # Rebalance after Failover operation
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen])
        if self.during_ops:
            self.sleep(5, "Wait for some progress in rebalance")
            if self.during_ops == "change_password":
                old_pass = self.master.rest_password
                self.change_password(new_password=self.input.param("new_password", "new_pass"))
                self.rest = RestConnection(self.master)
            elif self.during_ops == "change_port":
                self.change_port(new_port=self.input.param("new_port", "9090"))
                self.rest = RestConnection(self.master)
        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)
        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info(" Start Rebalance Again !")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes], ejectedNodes=[node.id for node in chosen])

        # Rebalance Monitoring
        msg = "rebalance failed while removing failover nodes {0}".format([node.id for node in chosen])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Reset password or port
        if self.during_ops:
            if self.during_ops == "change_password":
                self.change_password(new_password=old_pass)
            elif self.during_ops == "change_port":
                self.change_port(new_port='8091',
                                 current_port=self.input.param("new_port", "9090"))
            return

        #  Drain Queue and make sure intra-cluster replication is complete
        self.log.info("Begin VERIFICATION for Rebalance after Failover Only")
        self.verify_cluster_stats(_servers_, self.master, check_bucket_stats=True, check_ep_items_remaining=True)
        # Verify the full data set against recorded metadata if no mutation ops ran during failover
        if not self.withMutationOps:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, _servers_, self.buckets, path=None, addedItems=None)

        # Check Cluster Stats and Data as well if max_verify > 0
        # Check Failover logs :: Not sure about this logic, currently not checking, will update code once confirmed
        # Currently, only for checking the case where we have graceful failover
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, _servers_, self.buckets)
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, _servers_, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)
        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets)
        self.log.info("End VERIFICATION for Rebalance after Failover Only")

    def run_add_back_operation_and_verify(self, chosen, prev_vbucket_stats, record_static_data_set, prev_failover_stats):
        """
            Method to run the add-back operation with recovery type = (delta/full)
            It also verifies the operations are correct with data verification steps
        """
        _servers_ = self.filter_servers(self.servers, chosen)
        self._wait_for_stats_all_buckets(_servers_, check_ep_items_remaining=True)
        recoveryTypeMap = self.define_maps_during_failover(self.recoveryType)
        fileMapsForVerification = self.create_file(chosen, self.buckets, self.server_map)
        index = 0
        for node in chosen:
            self.sleep(5)
            if self.recoveryType:
                # define precondition for recoverytype
                self.rest.set_recovery_type(otpNode=node.id, recoveryType=self.recoveryType[index])
                index += 1
            else:
                self.rest.add_back_node(node.id)

        # Doc_mutation before triggering rebalance
        if self.flusher_total_batch_limit and \
                self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
            for task in tasks:
                task.result()

        self.sleep(20, "After failover before invoking rebalance...")
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[],
                            deltaRecoveryBuckets=self.deltaRecoveryBuckets)

        # Perform Compaction
        if self.compact:
            for bucket in self.buckets:
                self.cluster.compact_bucket(self.master, bucket)

        # Perform View Validation if Supported
        nodes = self.filter_servers(self.servers, chosen)
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)

        # Run operations if required during rebalance after failover
        if self.withMutationOps:
            self.run_mutation_operations_after_failover()

        # Kill or restart operations
        if self.killNodes or self.stopNodes or self.firewallOnNodes:
            self.victim_node_operations(node=chosen[0])
            self.sleep(60)
            self.log.info("Start Rebalance Again!")
            self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                                ejectedNodes=[],
                                deltaRecoveryBuckets=self.deltaRecoveryBuckets)
            self.sleep(10, "Wait for rebalance to start")

        # Check if node has to be killed or restarted during rebalance
        # Monitor Rebalance
        msg = "rebalance failed while removing failover nodes {0}".format(chosen)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)

        # Drain ep_queue and make sure that intra-cluster replication is complete
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)

        self.log.info("Begin VERIFICATION for Add-back and rebalance")

        # Verify Stats of cluster and Data is max_verify > 0
        self.verify_cluster_stats(self.servers, self.master, check_bucket_stats=True, check_ep_items_remaining=True)

        # Verify recovery Type succeeded if we added-back nodes
        self.verify_for_recovery_type(chosen, self.server_map, self.buckets, recoveryTypeMap, fileMapsForVerification, self.deltaRecoveryBuckets)

        # Comparison of all data if required
        if not self.withMutationOps and self.flusher_total_batch_limit is None:
            self.sleep(60)
            self.data_analysis_all(record_static_data_set, self.servers, self.buckets, path=None, addedItems=None)

        # Verify if vbucket sequence numbers and failover logs are as expected
        # We will check only for version  > 2.5.* and if the failover is graceful
        if self.version_greater_than_2_5 and self.graceful and self.upr_check:
            new_vbucket_stats = self.compare_vbucket_seqnos(prev_vbucket_stats, self.servers, self.buckets, perNode=False)
            new_failover_stats = self.compare_failovers_logs(prev_failover_stats, self.servers, self.buckets)
            self.compare_vbucketseq_failoverlogs(new_vbucket_stats, new_failover_stats)

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.get_nodes_in_cluster(self.master)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets)

        self.log.info("End VERIFICATION for Add-back and rebalance")

    def print_test_params(self, failover_reason):
        """ Method to print test parameters """
        self.log.info("num_replicas : {0}".format(self.num_replicas))
        self.log.info("recoveryType : {0}".format(self.recoveryType))
        self.log.info("failover_reason : {0}".format(failover_reason))
        self.log.info("num_failed_nodes : {0}".format(self.num_failed_nodes))
        self.log.info('picking server : {0} as the master'.format(self.master))

    def run_failover_operations(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        graceful_count = 0
        graceful_failover = True
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", self.wait_timeout * 10)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
            # verify the failover type
            if self.check_verify_failover_type:
                graceful_count, graceful_failover = self.verify_failover_type(node, graceful_count, self.num_replicas, unreachable)
            # define precondition check for failover
            success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
            if self.graceful and graceful_failover:
                if self.stopGracefulFailover or self.killNodes or self.stopNodes or self.firewallOnNodes:
                    self.victim_node_operations(node)
                    # Start graceful failover again
                    self.log.info("Start Graceful Failover Again!")
                    self.sleep(120)
                    success_failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
                    self.sleep(180)
                    msg = "graceful failover failed for nodes {0}".format(node.id)
                    self.log.info("chosen: {0} get_failover_count: {1}".format(len(chosen),
                                                                               self.get_failover_count()))
                    self.assertEqual(len(chosen), self.get_failover_count(), msg=msg)
                else:
                    msg = "rebalance failed while removing failover nodes {0}".format(node.id)
                    self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)
            failed_over = failed_over and success_failed_over

        # Check for negative cases
        if self.graceful and (failover_reason in ['stop_server', 'firewall']):
            if failed_over:
                # MB-10479
                self.rest.print_UI_logs()
            self.assertFalse(failed_over, "Graceful Falover was started for unhealthy node!!! ")
            return
        elif self.gracefulFailoverFail and not failed_over:
            # Check that fail_over fails as expected
            self.assertFalse(failed_over, "Graceful failover should fail due to not enough replicas")
            return

        # Check if failover happened as expected or re-try one more time
        if not failed_over:
            self.log.info("unable to failover the node the first time. try again in  60 seconds..")
            # try again in 75 seconds
            self.sleep(75)
            failed_over = self.rest.fail_over(node.id, graceful=(self.graceful and graceful_failover))
        if self.graceful and (failover_reason not in ['stop_server', 'firewall']):
            reached = RestHelper(self.rest).rebalance_reached()
            self.assertTrue(reached, "rebalance failed for graceful failover: stuck or did not complete")

        # Verify Active and Replica Bucket Count
        if self.num_replicas > 0:
            nodes = self.filter_servers(self.servers, chosen)
            self.vb_distribution_analysis(servers=nodes, buckets=self.buckets, std=20.0, total_vbuckets=self.total_vbuckets, type="failover", graceful=(self.graceful and graceful_failover))
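
    # Sketch of what rest.fail_over(node_id, graceful=...) resolves to at the
    # HTTP level: a hard failover posts to /controller/failOver, a graceful one
    # to /controller/startGracefulFailover. Field names are assumptions based
    # on the public cluster REST API.
    def example_fail_over_raw(self, otp_node, graceful=False):
        import urllib.parse
        endpoint = ("controller/startGracefulFailover" if graceful
                    else "controller/failOver")
        api = self.rest.baseUrl + endpoint
        params = urllib.parse.urlencode({"otpNode": otp_node})
        status, content, _ = self.rest._http_request(api, "POST", params)
        return status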

    def run_failover_operations_with_ops(self, chosen, failover_reason):
        """ Method to run fail over operations used in the test scenario based on failover reason """
        # Perform operations related to failover
        failed_over = True
        for node in chosen:
            unreachable = False
            if failover_reason == 'stop_server':
                unreachable = True
                self.stop_server(node)
                self.log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                unreachable = True
                self.filter_list.append(node.ip)
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(self.rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    self.log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                                shell.log_command_output(o, r)
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                                shell.log_command_output(o, r)
                            shell.disconnect()
                    self.rest.print_UI_logs()
                    api = self.rest.baseUrl + 'nodeStatuses'
                    status, content, header = self.rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")
        nodes = self.filter_servers(self.servers, chosen)
        failed_over = self.cluster.async_failover([self.master], failover_nodes=chosen, graceful=self.graceful)
        # Perform Compaction
        compact_tasks = []
        if self.compact:
            for bucket in self.buckets:
                compact_tasks.append(self.cluster.async_compact_bucket(self.master, bucket))
        # Run View Operations
        if self.withViewsOps:
            self.query_and_monitor_view_tasks(nodes)
        # Run mutation operations
        if self.withMutationOps:
            self.run_mutation_operations()
        failed_over.result()
        for task in compact_tasks:
            task.result()
        msg = "rebalance failed while removing failover nodes {0}".format(node.id)
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True), msg=msg)


    def load_initial_data(self):
        """ Method to run operations Update/Delete/Create """
        # Load All Buckets if num_items > 0
        tasks = []
        tasks += self._async_load_all_buckets(self.master, self.gen_initial_create, "create", 0, flag=2, batch_size=20000)
        for task in tasks:
            task.result()
        self._wait_for_stats_all_buckets(self.servers, check_ep_items_remaining=True)
        self._verify_stats_all_buckets(self.servers, timeout=120)

    def run_mutation_operations(self):
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)

    def run_mutation_operations_after_failover(self):
        mutation_ops_tasks = []
        if "create" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.afterfailover_gen_create, "create", 0)
        if "update" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.afterfailover_gen_update, "update", 0)
        if "delete" in self.doc_ops:
            mutation_ops_tasks += self._async_load_all_buckets(
                self.master, self.afterfailover_gen_delete, "delete", 0)
        try:
            for task in mutation_ops_tasks:
                task.result()
        except Exception as ex:
            self.log.info(ex)

    def define_maps_during_failover(self, recoveryType=None):
        """ Method to define the node ip -> recovery type map """
        recoveryTypeMap = {}
        index = 0
        for server in self.chosen:
            if recoveryType:
                recoveryTypeMap[server.ip] = recoveryType[index]
            index += 1
        return recoveryTypeMap

    def filter_servers(self, original_servers, filter_servers):
        """ Filter servers that have not failed over """
        _servers_ = copy.deepcopy(original_servers)
        for failed in filter_servers:
            for server in _servers_:
                if server.ip == failed.ip:
                    _servers_.remove(server)
                    self._cleanup_nodes.append(server)
        return _servers_

    def verify_for_recovery_type(self, chosen=[], serverMap={}, buckets=[], recoveryTypeMap={}, fileMap={}, deltaRecoveryBuckets=[]):
        """ Verify recovery type is delta or full """
        summary = ""
        logic = True
        for server in chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            os_type = shell.extract_remote_info()
            if os_type.type.lower() == 'windows':
                return
            for bucket in buckets:
                path = fileMap[server.ip][bucket.name]
                exists = shell.file_exists(path, "check.txt")
                if deltaRecoveryBuckets is not None:
                    if recoveryTypeMap[server.ip] == "delta" and (bucket.name in deltaRecoveryBuckets) and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "delta" and (bucket.name not in deltaRecoveryBuckets) and exists:
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(server.ip, bucket.name)
                        logic = False
                else:
                    if recoveryTypeMap[server.ip] == "delta" and not exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Delta, Actual Full".format(server.ip, bucket.name)
                    elif recoveryTypeMap[server.ip] == "full" and exists:
                        logic = False
                        summary += "\n Failed Condition :: node {0}, bucket {1} :: Expected Full, Actual Delta".format(server.ip, bucket.name)
            shell.disconnect()
        self.assertTrue(logic, summary)
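
    # Pure-function restatement of the decision table applied in
    # verify_for_recovery_type() above: the check.txt marker written by
    # create_file() should survive a delta recovery (data files are kept) and
    # vanish after a full recovery (data files are rebuilt). Illustrative only.
    @staticmethod
    def example_marker_should_exist(recovery_type, bucket_name,
                                    delta_recovery_buckets=None):
        if recovery_type != "delta":
            return False
        if delta_recovery_buckets is None:
            return True
        # with per-bucket delta recovery only the listed buckets keep files
        return bucket_name in delta_recovery_buckets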

    def run_view_creation_operations(self, servers):
        """" Run view Creation and indexing building tasks on servers """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        num_tries = self.input.param("num_tries", 10)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]

        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"

        views = []
        tasks = []
        for bucket in self.buckets:
            temp = self.make_default_views(self.default_view_name, num_views,
                                           is_dev_ddoc, different_map=False)
            temp_tasks = self.async_create_views(self.master, ddoc_name, temp, bucket)
            views += temp
            tasks += temp_tasks

        timeout = max(self.wait_timeout * 4, len(self.buckets) * self.wait_timeout * self.num_items // 50000)

        for task in tasks:
            task.result(self.wait_timeout * 20)
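
    # For reference, a sketch of the shape of design document that
    # make_default_views() generates above; the exact map function the
    # framework emits may differ, so treat this as an assumption.
    def example_default_ddoc(self, prefix, ddoc_name, num_views):
        views = {}
        for i in range(num_views):
            views["%s%s" % (self.default_view_name, i)] = {
                "map": "function (doc, meta) { emit(meta.id, null); }"
            }
        return {"_id": "_design/%s%s" % (prefix, ddoc_name),
                "views": views}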

    def query_and_monitor_view_tasks(self, servers):
        """ Monitor Query Tasks for their completion """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        self.verify_query_task()
        active_tasks = self.cluster.async_monitor_active_task(servers, "indexer", "_design/" + prefix + ddoc_name, wait_task=False)
        for active_task in active_tasks:
            result = active_task.result()
            self.assertTrue(result)

    def verify_query_task(self):
        """ Verify Query Results """
        num_views = self.input.param("num_views", 5)
        is_dev_ddoc = self.input.param("is_dev_ddoc", True)
        ddoc_name = "ddoc1"
        prefix = ("", "dev_")[is_dev_ddoc]
        query = {}
        query["connectionTimeout"] = 60000
        query["full_set"] = "true"
        expected_rows = None
        timeout = None
        if self.active_resident_threshold == 0:
            timeout = 2400
        if self.max_verify:
            expected_rows = self.max_verify
            query["limit"] = expected_rows
        query["stale"] = "false"
        for bucket in self.buckets:
            self.perform_verify_queries(num_views, prefix, ddoc_name, query, bucket=bucket, wait_time=timeout, expected_rows=expected_rows)

    def create_file(self, chosen, buckets, serverMap):
        """ Create marker files in the data paths for checking whether delta/full recovery occurred """
        fileMap = {}
        for server in chosen:
            shell = RemoteMachineShellConnection(serverMap[server.ip])
            dist_type = shell.extract_remote_info().distribution_type
            map = {}
            for bucket in buckets:
                if dist_type.lower() == 'windows':
                    # backslash keeps the space in "Program Files" escaped for shell commands
                    self.data_path = 'c:/Program\ Files/Couchbase/Server/var/lib/couchbase/data'
                bucket_data_path = self.data_path + "/" + bucket.name + "/" + "check.txt"
                full_path = self.data_path + "/" + bucket.name + "/"
                map[bucket.name] = full_path
                shell.create_file(bucket_data_path, "check")
            fileMap[server.ip] = map
            shell.disconnect()
        return fileMap

    def verify_failover_type(self, chosen=None, graceful_count=0, replica_count=0, unreachable=False):
        logic = True
        summary = ""
        nodes = self.rest.node_statuses()
        node_count = len(nodes)
        change_graceful_count = graceful_count
        graceful_failover = True
        if unreachable:
            node_count -= 1
        else:
            change_graceful_count += 1
        if replica_count != 0:
            for node in nodes:
                if unreachable and node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for unreachable node {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
                elif node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if replica_count > graceful_count and (node_count - 1) + graceful_count >= replica_count:
                        if not node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for node {0} Expected :: Graceful, Actual :: Hard".format(node.ip)
                    else:
                        if node.gracefulFailoverPossible:
                            logic = False
                            summary += "\n failover type for  {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
        else:
            for node in nodes:
                if node.ip == chosen.ip:
                    graceful_failover = node.gracefulFailoverPossible
                    if node.gracefulFailoverPossible:
                        logic = False
                        summary += "\n failover type for node {0} Expected :: Hard, Actual :: Graceful".format(node.ip)
        self.assertTrue(logic, summary)
        return change_graceful_count, graceful_failover
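
    # Standalone restatement of the feasibility rule checked above: graceful
    # failover of the chosen node should only be offered while the remaining
    # nodes, plus nodes already counted as graceful, can still host every
    # replica. Mirrors the condition on healthy nodes in verify_failover_type().
    @staticmethod
    def example_graceful_possible(node_count, replica_count, graceful_count):
        if replica_count == 0:
            return False
        return (replica_count > graceful_count
                and (node_count - 1) + graceful_count >= replica_count)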

    def get_server_map(self, node):
        """ Map of ips and server information """
        map = {}
        for server in self.servers:
            map[server.ip] = server
        return map

    def victim_node_operations(self, node=None):
        if self.stopGracefulFailover:
            self.log.info("Stopping Graceful Failover")
            stopped = self.rest.stop_rebalance(wait_timeout=self.wait_timeout // 3)
            self.assertTrue(stopped, msg="unable to stop rebalance")
        if self.killNodes:
            self.log.info("Killing Memcached")
            kill_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for kill_node in kill_nodes:
                self.kill_server_memcached(kill_node)
        if self.stopNodes:
            self.log.info("Stopping Node")
            stop_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for stop_node in stop_nodes:
                self.stop_server(stop_node)
            self.sleep(10)
            self.log.info("Starting Node")
            for start_node in stop_nodes:
                self.start_server(start_node)
        if self.firewallOnNodes:
            self.log.info("Enabling Firewall for Node")
            stop_nodes = self.get_victim_nodes(self.servers, self.master, node, self.victim_type, self.victim_count)
            for stop_node in stop_nodes:
                self.start_firewall_on_node(stop_node)
            self.sleep(30)
            self.log.info("Disabling Firewall for Node")
            for start_node in stop_nodes:
                self.stop_firewall_on_node(start_node)
        self.sleep(120)

    def get_failover_count(self):
        rest = RestConnection(self.master)
        cluster_status = rest.cluster_status()
        failover_count = 0
        # check for inactiveFailed
        for node in cluster_status['nodes']:
            if node['clusterMembership'] == "inactiveFailed":
                failover_count += 1
        return failover_count
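
    # Companion sketch to get_failover_count(): tallies every clusterMembership
    # state instead of only inactiveFailed. The state names seen in practice
    # (active, inactiveFailed, inactiveAdded) come from the same cluster_status
    # payload used above.
    def example_membership_counts(self):
        rest = RestConnection(self.master)
        counts = {}
        for node in rest.cluster_status()['nodes']:
            state = node['clusterMembership']
            counts[state] = counts.get(state, 0) + 1
        return counts
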
Ejemplo n.º 47
0
    def common_test_body(self, keys_count, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(self.num_replicas))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master, self.gen_create, "create", 0,
                               batch_size=10000, pause_secs=5, timeout_secs=180)
        self._wait_for_stats_all_buckets(self.servers)

        _servers_ = self.servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        RebalanceHelper.wait_for_replication(self.servers, self.cluster)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=self.num_replicas)
        for node in chosen:
            # let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info("10 seconds delay to wait for membase-server to shutdown")
                # wait for 5 minutes until node is down
                self.assertTrue(RestHelper(rest).wait_for_node_status(node, "unhealthy", 300),
                                    msg="node status is not unhealthy even after waiting for 5 minutes")
            elif failover_reason == "firewall":
                server = [srv for srv in self.servers if node.ip == srv.ip][0]
                RemoteUtilHelper.enable_firewall(server, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(node.ip, node.port))
                else:
                    # verify iptables on the node if something wrong
                    for server in self.servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            info = shell.extract_remote_info()
                            if info.type.lower() == "windows":
                                o, r = shell.execute_command("netsh advfirewall show allprofiles")
                            else:
                                o, r = shell.execute_command("/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    for i in rest.get_logs():
                        self.log.error(i)
                    api = rest.baseUrl + 'nodeStatuses'
                    status, content, header = rest._http_request(api)
                    json_parsed = json.loads(content)
                    self.log.info("nodeStatuses: {0}".format(json_parsed))
                    self.fail("node status is not unhealthy even after waiting for 5 minutes")

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info("unable to failover the node the first time. try again in  60 seconds..")
                # try again in 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(failed_over, "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info("10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            # Need a delay > min because MB-7168
            log.info("60 seconds sleep after failover before invoking rebalance...")
            time.sleep(60)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[node.id for node in chosen])
            if self.during_ops:
                self.sleep(5, "Wait for some progress in rebalance")
                if self.during_ops == "change_password":
                    old_pass = self.master.rest_password
                    self.change_password(new_password=self.input.param("new_password", "new_pass"))
                    rest = RestConnection(self.master)
                elif self.during_ops == "change_port":
                    self.change_port(new_port=self.input.param("new_port", "9090"))
                    rest = RestConnection(self.master)
            try:
                msg = "rebalance failed while removing failover nodes {0}".format(chosen)
                self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
                for failed in chosen:
                    for server in _servers_:
                        if server.ip == failed.ip:
                            _servers_.remove(server)
                            self._cleanup_nodes.append(server)

                log.info("Begin VERIFICATION ...")
                RebalanceHelper.wait_for_replication(_servers_, self.cluster)
                self.verify_cluster_stats(_servers_, self.master)
            finally:
                if self.during_ops:
                     if self.during_ops == "change_password":
                         self.change_password(new_password=old_pass)
                     elif self.during_ops == "change_port":
                         self.change_port(new_port='8091',
                                          current_port=self.input.param("new_port", "9090"))