Example #1
    def replication_verification(master, bucket, replica, inserted_count, test):
        rest = RestConnection(master)
        nodes = rest.node_statuses()

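        # replication can only be verified when the cluster has at least (1 + replica) nodes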
        if len(nodes) / (1 + replica) >= 1:
            final_replication_state = RestHelper(rest).wait_for_replication(900)
            msg = "replication state after waiting for up to 15 minutes : {0}"
            test.log.info(msg.format(final_replication_state))
            # in windows, we need to set timeout_in_seconds to 15+ minutes
            test.assertTrue(RebalanceHelper.wait_till_total_numbers_match(master=master,
                                                                          bucket=bucket,
                                                                          timeout_in_seconds=1200),
                            msg="replication was completed but sum(curr_items) don't match the curr_items_total")

            start_time = time.time()
            stats = rest.get_bucket_stats()
            while time.time() < (start_time + 120) and stats["curr_items"] != inserted_count:
                test.log.info("curr_items : {0} versus {1}".format(stats["curr_items"], inserted_count))
                time.sleep(5)
                stats = rest.get_bucket_stats()
            RebalanceHelper.print_taps_from_all_nodes(rest, bucket)
            test.log.info("curr_items : {0} versus {1}".format(stats["curr_items"], inserted_count))
            stats = rest.get_bucket_stats()
            msg = "curr_items : {0} is not equal to actual # of keys inserted : {1}"
            test.assertEquals(stats["curr_items"], inserted_count,
                              msg=msg.format(stats["curr_items"], inserted_count))
Example #2
    def run_test(self):
        ep_threshold = self.input.param("ep_threshold", "ep_mem_low_wat")
        active_resident_threshold = int(self.input.param("active_resident_threshold", 10))

        mc = MemcachedClientHelper.direct_client(self.servers[0], self.bucket_name)
        stats = mc.stats()
        threshold = int(self.input.param("threshold", stats[ep_threshold]))
        threshold_reached = False
        self.num_items = self.input.param("items", 10000)
        self._load_doc_data_all_buckets("create")

        # keep loading items until mem_used reaches the threshold and the active resident ratio drops below active_resident_threshold (DGM)
        while not threshold_reached:
            mem_used = int(mc.stats()["mem_used"])
            if mem_used < threshold or int(mc.stats()["vb_active_perc_mem_resident"]) >= active_resident_threshold:
                self.log.info(
                    "mem_used and vb_active_perc_mem_resident_ratio reached at %s/%s and %s "
                    % (mem_used, threshold, mc.stats()["vb_active_perc_mem_resident"])
                )
                items = self.num_items
                self.num_items += self.input.param("items", 10000)
                self._load_doc_data_all_buckets("create", items)
            else:
                threshold_reached = True
                self.log.info("DGM state achieved!!!!")

        # wait for draining of data before restart and warm up
        for bucket in self.buckets:
            RebalanceHelper.wait_for_persistence(self.nodes_server[0], bucket)

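        # repeatedly start a reader thread, hammer "stats all"/"stats reset" for 5 minutes, then join the reader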
        while 1:

            #            read_data_task = self.cluster.async_verify_data(self.master, self.buckets[0], self.buckets[0].kvs[1])

            read_data_task = Thread(target=self._run_get)
            read_data_task.start()
            # 5 threads to run stats all and reset asynchronously
            start = time.time()
            while (time.time() - start) < 300:

                stats_all_thread = []
                stats_reset_thread = []

                for i in xrange(self.threads_to_run):
                    stat_str = ""
                    stats_all_thread.append(Thread(target=self._get_stats, args=[stat_str]))
                    stats_all_thread[i].start()
                    stat_str = "reset"
                    stats_reset_thread.append(Thread(target=self._get_stats, args=[stat_str]))
                    stats_reset_thread[i].start()

                for i in xrange(self.threads_to_run):
                    stats_all_thread[i].join()
                    stats_reset_thread[i].join()

                del stats_all_thread
                del stats_reset_thread

            #            read_data_task.result()
            read_data_task.join()
Example #3
 def create_ddocs(self, is_dev_view):
     mapview = View(
         self.map_view_name,
         """function(doc) {
          emit(doc.integer, doc.string);
       }""",
         dev_view=is_dev_view,
     )
     self.cluster.create_view(self.master, "test", mapview)
     redview = View(
         self.red_view_name,
         """function(doc) {
          emit([doc.integer, doc.string], doc.integer);
       }""",
         """_count""",
         dev_view=is_dev_view,
     )
     self.cluster.create_view(self.master, "test", redview)
     redview_stats = View(
         self.red_view_stats_name,
         """function(doc) {
          emit(doc.string, doc.string);
       }""",
         """_stats""",
         dev_view=is_dev_view,
     )
     self.cluster.create_view(self.master, "test2", redview_stats)
     RebalanceHelper.wait_for_persistence(self.master, self.bucket, 0)
Example #4
    def _verify_stats_all_buckets(self, servers, timeout=60):
        stats_tasks = []
        for bucket in self.buckets:
            items = sum([len(kv_store) for kv_store in bucket.kvs.values()])
            stats_tasks.append(self.cluster.async_wait_for_stats(servers, bucket, '',
                               'curr_items', '==', items))
            stats_tasks.append(self.cluster.async_wait_for_stats(servers, bucket, '',
                               'vb_active_curr_items', '==', items))

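            # a bucket can only have as many replica copies as there are other servers to hold them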
            available_replicas = self.num_replicas
            if len(servers) <= self.num_replicas:
                available_replicas = len(servers) - 1

            stats_tasks.append(self.cluster.async_wait_for_stats(servers, bucket, '',
                                   'vb_replica_curr_items', '==', items * available_replicas))
            stats_tasks.append(self.cluster.async_wait_for_stats(servers, bucket, '',
                                   'curr_items_tot', '==', items * (available_replicas + 1)))
        try:
            for task in stats_tasks:
                task.result(timeout)
        except Exception as e:
            print e
            for task in stats_tasks:
                task.cancel()
            self.log.error("unable to get expected stats from all nodes! Printing TAP stats from all nodes:")
            rest = RestConnection(self.master)
            for bucket in self.buckets:
                RebalanceHelper.print_taps_from_all_nodes(rest, bucket)
            raise Exception("unable to get expected stats during {0} sec".format(timeout))
Example #5
 def test_views_failover(self):
     num_nodes = self.input.param('num-nodes', 1)
     ddocs = self.make_ddocs(self.num_ddoc, self.views_per_ddoc, 0)
     RebalanceHelper.wait_for_persistence(self.master, self.bucket_name)
     self.cluster.failover(self.servers,
                           self.servers[1:num_nodes])
     self.cluster.rebalance(self.servers, [], self.servers[1:num_nodes])
     self.perform_ddoc_ops(ddocs)
Example #6
 def _verify_data(self, master, rest, inserted_keys):
     log = logger.Logger.get_logger()
     log.info("Verifying data")
     ready = RebalanceHelper.wait_for_stats_on_all(master, "default", "ep_queue_size", 0)
     self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
     ready = RebalanceHelper.wait_for_stats_on_all(master, "default", "ep_flusher_todo", 0)
     self.assertTrue(ready, "wait_for ep_flusher_todo == 0 failed")
     BucketOperationHelper.keys_exist_or_assert(keys=inserted_keys, server=master, bucket_name="default", test=self)
Example #7
 def verify_data(master, inserted_keys, bucket, test):
     test.log.info("Verifying data")
     ready = RebalanceHelper.wait_for_stats_on_all(master, bucket, 'ep_queue_size', 0)
     test.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
     ready = RebalanceHelper.wait_for_stats_on_all(master, bucket, 'ep_flusher_todo', 0)
     test.assertTrue(ready, "wait_for ep_flusher_todo == 0 failed")
     BucketOperationHelper.keys_exist_or_assert_in_parallel(keys=inserted_keys, server=master, \
         bucket_name=bucket, test=test, concurrency=4)
Example #8
    def _failover_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings
        num_initial_servers = self.num_initial_servers
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, servers_rebalanced = RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.fail_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status, msg="Unable to find orchestrator: {0}:{1}".format(status, content))
            optNodesIds[0] = content

        self.log.info("FAILOVER PHASE")
        # Failover selected nodes
        for node in optNodesIds:
            self.log.info("failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

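        # swap in as many fresh nodes as were just failed over; the rebalance below ejects the failed nodes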
        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.failover_factor]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.fail_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()], \
            ejectedNodes=optNodesIds)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(new_swap_servers))

        SwapRebalanceBase.verification_phase(self, master)
Example #9
    def _test_delete_key_and_backup_and_restore_body(self):
        bucket = "default"
        BucketOperationHelper.create_bucket(serverInfo=self.master, name=bucket, test_case=self)
        ready = BucketOperationHelper.wait_for_memcached(self.master, bucket)
        self.assertTrue(ready, "wait_for_memcached failed")

        self.add_nodes_and_rebalance()

        client = MemcachedClientHelper.direct_client(self.master, "default")
        expiry = 2400
        test_uuid = uuid.uuid4()
        keys = ["key_%s_%d" % (test_uuid, i) for i in range(500)]
        self.log.info("pushing keys with expiry set to {0}".format(expiry))
        for key in keys:
            try:
                client.set(key, expiry, 0, "1")
            except mc_bin_client.MemcachedError as error:
                msg = "unable to push key : {0} to bucket : {1} error : {2}"
                self.log.error(msg.format(key, client.vbucketId, error.status))
                self.fail(msg.format(key, client.vbucketId, error.status))
        self.log.info("inserted {0} keys with expiry set to {1}".format(len(keys), expiry))

        client.delete(keys[0])

        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0)
        self.assertTrue(ready, "wait_for ep_flusher_todo == 0 failed")

        #let's create a unique folder in the remote location
        for server in self.servers:
            shell = RemoteMachineShellConnection(server)
            output, error = shell.execute_command(self.perm_command)
            shell.log_command_output(output, error)
            node = RestConnection(server).get_nodes_self()
            BackupHelper(server, self).backup(bucket, node, self.remote_tmp_folder)
            shell.disconnect()

        for server in self.servers:
            BackupHelper(server, self).restore(self.remote_tmp_folder)
            time.sleep(10)

        self.log.info('verifying that all those keys...')
        missing_keys = []
        verify_keys = []
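        # keys[0] was deleted before the backup, so it must stay missing after restore; every other key must exist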
        for key in keys:
            vBucketId = crc32.crc32_hash(key) & 1023  # or & 0x3FF
            client.vbucketId = vBucketId
            if key == keys[0]:
                missing_keys.append(key)
            else:
                verify_keys.append(key)

        self.assertTrue(BucketOperationHelper.keys_dont_exist(self.master, missing_keys, self),
                        "deleted key unexpectedly present after restore")
        self.assertTrue(BucketOperationHelper.verify_data(self.master, verify_keys, False, False, 11210, self),
                        "Missing keys")
Example #10
 def create_ddocs(self):
     mapview = View(self.map_view_name, '''function(doc) {
          emit(doc.integer, doc.string);
       }''', dev_view=self.is_dev_view)
     self.cluster.create_view(self.master, 'test', mapview)
     redview = View(self.red_view_name, '''function(doc) {
          emit([doc.integer, doc.string], doc.integer);
       }''', '''_count''', dev_view=self.is_dev_view)
     self.cluster.create_view(self.master, 'test', redview)
     RebalanceHelper.wait_for_persistence(self.master, self.bucket, 0)
Example #11
 def test_parallel_DB_views_compaction(self):
     rest = RestConnection(self.master)
     self.set_auto_compaction(rest, parallelDBAndVC="true", viewFragmntThresholdPercentage=self.fragmentation_value, dbFragmentThresholdPercentage=self.fragmentation_value)
     self.make_ddocs(self.ddocs_num, self.view_per_ddoc)
     self.create_ddocs()
     self._load_all_buckets(self.master, self.gen_load, "create", 0)
     RebalanceHelper.wait_for_persistence(self.master, self.default_bucket_name)
     self._compaction_thread()
     if self.thread_crashed.is_set():
         self.fail("Error occurred during run")
Example #12
 def _monitor_drain_queue(self):
     # block until the bucket's disk write queue (ep_queue_size) and flusher backlog drain to zero
     rest = RestConnection(self.master)
     start = time.time()
     stats = rest.get_bucket_stats(self.bucket)
     self.log.info("current ep_queue_size: {0}".format(stats["ep_queue_size"]))
     verified = (RebalanceHelper.wait_for_stats(self.master, self.bucket, 'ep_queue_size', 0, timeout_in_seconds=300, verbose=False)
                 and RebalanceHelper.wait_for_stats(self.master, self.bucket, 'ep_flusher_todo', 0, timeout_in_seconds=300, verbose=False))
     self.drained = verified
     self.drained_in_seconds = time.time() - start
Example #13
 def test_parallel_enable_DB_compaction(self):
     rest = RestConnection(self.master)
     self.set_auto_compaction(rest, parallelDBAndVC="true", dbFragmentThresholdPercentage=self.fragmentation_value)
     self.make_ddocs(self.ddocs_num, self.view_per_ddoc)
     self.create_ddocs()
     self._load_all_buckets(self.master, self.gen_load, "create", 0)
     RebalanceHelper.wait_for_persistence(self.master, self.default_bucket_name)
     self._compaction_thread()
     if self.thread_crashed.is_set():
         self.log.info("View Compaction is not started as expected")
Example #14
 def test_observe_with_warmup(self):
     self._load_doc_data_all_buckets('create', 0, self.num_items)
     # Persist all the loaded data items
     self.log.info("Nodes in cluster: %s" % self.servers[:self.nodes_init])
     for bucket in self.buckets:
         RebalanceHelper.wait_for_persistence(self.master, bucket)
         self._stats_befor_warmup(bucket.name)
         self._restart_memcache(bucket.name)
         # for bucket in self.buckets:
         ClusterOperationHelper._wait_warmup_completed(self, self.servers[:self.nodes_init], bucket.name)
         self._run_observe(self)
Example #15
    def _test_backup_and_restore_bucket_overwriting_body(self, overwrite_flag=True):
        bucket = "default"
        BucketOperationHelper.create_bucket(serverInfo=self.master, test_case=self)
        BucketOperationHelper.wait_for_memcached(self.master, bucket)
        self.add_nodes_and_rebalance()

        client = MemcachedClientHelper.direct_client(self.master, "default")
        expiry = 2400
        test_uuid = uuid.uuid4()
        keys = ["key_%s_%d" % (test_uuid, i) for i in range(500)]
        self.log.info("pushing keys with expiry set to {0}".format(expiry))
        for key in keys:
            try:
                client.set(key, expiry, 0, "1")
            except mc_bin_client.MemcachedError as error:
                msg = "unable to push key : {0} to bucket : {1} error : {2}"
                self.log.error(msg.format(key, client.vbucketId, error.status))
                self.fail(msg.format(key, client.vbucketId, error.status))
        self.log.info("inserted {0} keys with expiry set to {1}".format(len(keys), expiry))

        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0)
        self.assertTrue(ready, "wait_for ep_flusher_todo == 0 failed")

        for server in self.servers:
            shell = RemoteMachineShellConnection(server)

            output, error = shell.execute_command(self.perm_command)
            shell.log_command_output(output, error)
            node = RestConnection(server).get_nodes_self()
            BackupHelper(server, self).backup(bucket, node, self.remote_tmp_folder)
            shell.disconnect()

        for key in keys:
            try:
                client.replace(key, expiry, 0, "2")
            except mc_bin_client.MemcachedError as error:
                msg = "unable to replace key : {0} in bucket : {1} error : {2}"
                self.log.error(msg.format(key, client.vbucketId, error.status))
                self.fail(msg.format(key, client.vbucketId, error.status))
        self.log.info("replaced {0} keys with expiry set to {1}".format(len(keys), expiry))

        for server in self.servers:
            BackupHelper(server, self).restore(self.remote_tmp_folder, overwrite_flag)
            time.sleep(10)

        self.log.info('verifying that all those keys...')
        for key in keys:
            if overwrite_flag:
                self.assertEqual("2", client.get(key=key), key + " should has value = 2")
            else:
                self.assertNotEqual("2", client.get(key=key), key + " should not has value = 2")
        self.log.info("verified that those keys inserted with expiry set to {0} have expired".format(expiry))
Example #16
    def wait_until_warmed_up(self, master=None):
        if not master:
            master = self.input.servers[0]

        bucket = self.param("bucket", "default")

        fn = RebalanceHelper.wait_for_mc_stats_no_timeout
        for bucket in self.buckets:
            RebalanceHelper.wait_for_stats_on_all(master, bucket,
                                                  'ep_warmup_thread',
                                                  'complete', fn=fn)
Example #17
    def _test_cluster_topology_change_body(self):
        bucket = "default"
        BucketOperationHelper.create_bucket(serverInfo=self.master, test_case=self)
        ready = BucketOperationHelper.wait_for_memcached(self.master, bucket)
        self.assertTrue(ready, "wait_for_memcached failed")
        self.add_nodes_and_rebalance()

        rest = RestConnection(self.master)

        distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05}

        inserted_keys, rejected_keys = MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.master],
                                                                                             ram_load_ratio=1,
                                                                                             value_size_distribution=distribution,
                                                                                             moxi=True,
                                                                                             write_only=True,
                                                                                             number_of_threads=2)

        self.log.info("Sleep after data load")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")

        #let's create a unique folder in the remote location
        for server in self.servers:
            shell = RemoteMachineShellConnection(server)
            output, error = shell.execute_command(self.perm_command)
            shell.log_command_output(output, error)
            node = RestConnection(server).get_nodes_self()
            BackupHelper(server, self).backup(bucket, node, self.remote_tmp_folder)
            shell.disconnect()

        ClusterOperationHelper.cleanup_cluster(self.servers)
        BucketOperationHelper.delete_all_buckets_or_assert(self.servers, self)

        servers = list(self.servers[:-1])

        self.add_node_and_rebalance(servers[0], servers)

        BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self)
        BucketOperationHelper.create_bucket(serverInfo=self.master, test_case=self)

        ready = BucketOperationHelper.wait_for_memcached(self.master, bucket)
        self.assertTrue(ready, "wait_for_memcached failed")

        for server in self.servers:
            BackupHelper(server, self).restore(self.remote_tmp_folder)
            time.sleep(10)

        BucketOperationHelper.verify_data(self.master, inserted_keys, False, False, 11210, self)
Example #18
 def load_data(self, master, bucket, keys_count):
     log = logger.Logger.get_logger()
     inserted_keys_cnt = 0
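     # load_bucket may insert fewer items than requested, so keep loading until keys_count is reached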
     while inserted_keys_cnt < keys_count:
         keys_cnt, rejected_keys_cnt = MemcachedClientHelper.load_bucket(
             servers=[master], name=bucket, number_of_items=keys_count, number_of_threads=5, write_only=True
         )
         inserted_keys_cnt += keys_cnt
     log.info("wait until data is completely persisted on the disk")
     RebalanceHelper.wait_for_stats_on_all(master, bucket, "ep_queue_size", 0)
     RebalanceHelper.wait_for_stats_on_all(master, bucket, "ep_flusher_todo", 0)
     return inserted_keys_cnt
Example #19
 def load_data(master, bucket, keys_count=-1, load_ratio=-1):
     log = logger.Logger.get_logger()
     inserted_keys, rejected_keys =\
     MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[master],
                                                           name=bucket,
                                                           ram_load_ratio=load_ratio,
                                                           number_of_items=keys_count,
                                                           number_of_threads=2,
                                                           write_only=True)
     log.info("wait until data is completely persisted on the disk")
     RebalanceHelper.wait_for_stats_on_all(master, bucket, 'ep_queue_size', 0)
     RebalanceHelper.wait_for_stats_on_all(master, bucket, 'ep_flusher_todo', 0)
     return inserted_keys
Example #20
 def test_vbucket_uuid(self):
     """
         Test to show usage of vbucket information collection via api
         and than comparison and running the logic for analysis
         This is done for cluster and node level as well
     """
     self.gen_create = BlobGenerator('loadOne', 'loadOne_', self.value_size, end=self.num_items)
     self._load_all_buckets(self.master, self.gen_create, "create", 0,
                            batch_size=10000, pause_secs=10, timeout_secs=60)
     self._wait_for_stats_all_buckets(self.servers)
     RebalanceHelper.wait_for_replication(self.servers, self.cluster)
     vbucket_stats = self.get_vbucket_seqnos(self.servers, self.buckets, perNode=True)
     logic, output = self.compare_per_node_maps(vbucket_stats)
     self.assertTrue(logic, output)
Example #21
    def wait_until_repl(self):
        print "[perf.repl] waiting for replication: %s"\
            % time.strftime(PerfDefaults.strftime)

        master = self.input.servers[0]
        bucket = self.param("bucket", "default")

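        # each of the replication-related queues below must drain to zero on every node (no timeout)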
        RebalanceHelper.wait_for_stats_on_all(master, bucket,
            'vb_replica_queue_size', 0,
            fn=RebalanceHelper.wait_for_stats_no_timeout)

        RebalanceHelper.wait_for_stats_on_all(master, bucket,
            'ep_tap_replica_queue_itemondisk', 0,
            fn=RebalanceHelper.wait_for_stats_no_timeout)

        RebalanceHelper.wait_for_stats_on_all(master, bucket,
            'ep_tap_rebalance_queue_backfillremaining', 0,
            fn=RebalanceHelper.wait_for_stats_no_timeout)

        RebalanceHelper.wait_for_stats_on_all(master, bucket,
            'ep_tap_replica_qlen', 0,
            fn=RebalanceHelper.wait_for_stats_no_timeout)

        print "[perf.repl] replication is done: %s"\
            % time.strftime(PerfDefaults.strftime)
Example #22
 def load_data(self, master, bucket, keys_count):
     log = logger.Logger.get_logger()
     #        gen_create = BlobGenerator("loadONE", "loadONE-", 256, start=0, end=keys_count)
     #        BaseTestCase._load_all_buckets(master, gen_create, "create", 0)
     inserted_keys_cnt = 0
     while inserted_keys_cnt < keys_count:
         keys_cnt, rejected_keys_cnt = MemcachedClientHelper.load_bucket(
             servers=[master], name=bucket, number_of_items=keys_count, number_of_threads=5, write_only=True
         )
         inserted_keys_cnt += keys_cnt
     log.info("wait until data is completely persisted on the disk")
     RebalanceHelper.wait_for_stats_on_all(master, bucket, "ep_queue_size", 0)
     RebalanceHelper.wait_for_stats_on_all(master, bucket, "ep_flusher_todo", 0)
     return inserted_keys_cnt
Example #23
    def _test_view_on_multiple_docs(self, num_docs, params={"stale":"update_after"}, delay=10):
        self.log.info("description : create a view on {0} documents".format(num_docs))
        master = self.servers[0]
        rest = RestConnection(master)
        bucket = "default"
        view_name = "dev_test_view_on_{1}_docs-{0}".format(str(uuid.uuid4())[:7], self.num_docs)
        map_fn = "function (doc) {if(doc.name.indexOf(\"" + view_name + "\") != -1) { emit(doc.name, doc);}}"
        rest.create_view(view_name, bucket, [View(view_name, map_fn, dev_view=False)])
        self.created_views[view_name] = bucket
        rest = RestConnection(self.servers[0])
        smart = VBucketAwareMemcached(rest, bucket)
        doc_names = []
        prefix = str(uuid.uuid4())[:7]
        total_time = 0
        self.log.info("inserting {0} json objects".format(num_docs))
        for i in range(0, num_docs):
            key = doc_name = "{0}-{1}-{2}".format(view_name, prefix, i)
            doc_names.append(doc_name)
            value = {"name": doc_name, "age": 1000}
            smart.set(key, 0, 0, json.dumps(value))
        self.log.info("inserted {0} json documents".format(len(doc_names)))
        time.sleep(10)
        results = ViewBaseTests._get_view_results(self, rest, bucket, view_name, len(doc_names), extra_params=params)
        view_time = results['view_time']

        keys = ViewBaseTests._get_keys(self, results)

        RebalanceHelper.wait_for_persistence(master, bucket, 0)

        total_time = view_time
        # keep re-querying the view until it returns all docs, for up to 900 seconds
        # (timeout increased for windows testing)
        start_time = time.time()
        while (len(keys) != len(doc_names)) and (time.time() - start_time < 900):
            msg = "view returned {0} items , expected to return {1} items"
            self.log.info(msg.format(len(keys), len(doc_names)))
            self.log.info("trying again in {0} seconds".format(delay))
            time.sleep(delay)
            results = ViewBaseTests._get_view_results(self, rest, bucket, view_name, len(doc_names), extra_params=params)
            view_time = results['view_time']
            total_time += view_time
            keys = ViewBaseTests._get_keys(self, results)

        self.log.info("View time: {0} secs".format(total_time))

        # Only if the lengths are not equal, look for missing keys
        if len(keys) != len(doc_names):
            not_found = list(set(doc_names) - set(keys))
            ViewBaseTests._print_keys_not_found(self, not_found, 10)
            self.fail("map function did not return docs for {0} keys".format(len(not_found)))
Example #24
    def _test_backup_add_restore_bucket_with_expiration_key(self, replica):
        bucket = "default"
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        size = int(info.memoryQuota * 2.0 / 3.0)
        rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=info.moxi, replicaNumber=replica)
        BucketOperationHelper.wait_for_memcached(self.master, bucket)
        client = MemcachedClientHelper.direct_client(self.master, bucket)
        expiry = 60
        test_uuid = uuid.uuid4()
        keys = ["key_%s_%d" % (test_uuid, i) for i in range(5000)]
        self.log.info("pushing keys with expiry set to {0}".format(expiry))
        for key in keys:
            try:
                client.set(key, expiry, 0, key)
            except mc_bin_client.MemcachedError as error:
                msg = "unable to push key : {0} to bucket : {1} error : {2}"
                self.log.error(msg.format(key, client.vbucketId, error.status))
                self.fail(msg.format(key, client.vbucketId, error.status))
        client.close()
        self.log.info("inserted {0} keys with expiry set to {1}".format(len(keys), expiry))
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        node = RestConnection(self.master).get_nodes_self()

        output, error = self.shell.execute_command(self.perm_command)
        self.shell.log_command_output(output, error)
        backupHelper = BackupHelper(self.master, self)
        backupHelper.backup(bucket, node, self.remote_tmp_folder)

        BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self)
        rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=info.moxi)
        BucketOperationHelper.wait_for_memcached(self.master, bucket)
        backupHelper.restore(self.remote_tmp_folder)
        time.sleep(60)
        client = MemcachedClientHelper.direct_client(self.master, bucket)
        self.log.info('verifying that all those keys have expired...')
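        # memcached status 1 is KEY_ENOENT; every get must fail because the keys expired before the restore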
        for key in keys:
            try:
                client.get(key=key)
                msg = "expiry was set to {0} but key: {1} did not expire after waiting for {2}+ seconds"
                self.fail(msg.format(expiry, key, expiry))
            except mc_bin_client.MemcachedError as error:
                self.assertEquals(error.status, 1,
                                  msg="expected error code {0} but saw error code {1}".format(1, error.status))
        client.close()
        self.log.info("verified that those keys inserted with expiry set to {0} have expired".format(expiry))
Example #25
    def _test_backup_and_restore_from_to_different_buckets(self):
        bucket_before_backup = "bucket_before_backup"
        bucket_after_backup = "bucket_after_backup"
        BucketOperationHelper.create_bucket(serverInfo=self.master, name=bucket_before_backup, port=11212,
                                            test_case=self)
        ready = BucketOperationHelper.wait_for_memcached(self.master, bucket_before_backup)
        self.assertTrue(ready, "wait_for_memcached failed")

        self.add_nodes_and_rebalance()

        distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05}
        inserted_keys, rejected_keys = MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.master],
                                                                                             name=bucket_before_backup,
                                                                                             ram_load_ratio=20,
                                                                                             value_size_distribution=distribution,
                                                                                             write_only=True,
                                                                                             moxi=True,
                                                                                             number_of_threads=2)

        self.log.info("Sleep after data load")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket_before_backup, 'ep_queue_size', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket_before_backup, 'ep_flusher_todo', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")

        for server in self.servers:
            shell = RemoteMachineShellConnection(server)
            output, error = shell.execute_command(self.perm_command)
            shell.log_command_output(output, error)
            node = RestConnection(server).get_nodes_self()
            BackupHelper(server, self).backup(bucket_before_backup, node, self.remote_tmp_folder)
            shell.disconnect()

        BucketOperationHelper.delete_bucket_or_assert(self.master, bucket_before_backup, self)
        BucketOperationHelper.create_bucket(serverInfo=self.master, name=bucket_after_backup, port=11212,
                                            test_case=self)
        ready = BucketOperationHelper.wait_for_memcached(self.master, bucket_after_backup)
        self.assertTrue(ready, "wait_for_memcached failed")

        for server in self.servers:
            BackupHelper(server, self).restore(self.remote_tmp_folder, moxi_port=11212)
            time.sleep(10)

        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket_after_backup, 'ep_queue_size', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket_after_backup, 'ep_flusher_todo', 0)
        self.assertTrue(ready, "wait_for ep_flusher_todo == 0 failed")
        self.assertTrue(BucketOperationHelper.verify_data(self.master, inserted_keys, False, False, 11212, debug=False,
                                                          bucket=bucket_after_backup), "Missing keys")
Example #26
 def test_failoverlogs_extraction_equals(self):
     """
         Test to show usage of failover log collection via api
         and than comparison and running the logic for analysis
         This is done for cluster and node level as well
     """
     self.gen_create = BlobGenerator('loadOne', 'loadOne_', self.value_size, end=self.num_items)
     self._load_all_buckets(self.master, self.gen_create, "create", 0,
                            batch_size=10000, pause_secs=10, timeout_secs=60)
     self._wait_for_stats_all_buckets(self.servers)
     RebalanceHelper.wait_for_replication(self.servers, self.cluster)
     failovers_stats = self.get_failovers_logs(self.servers, self.buckets, perNode=True)
     self.compare_failovers_logs(failovers_stats, self.servers, self.buckets, perNode=True)
     failovers_stats = self.get_failovers_logs(self.servers, self.buckets, perNode=False)
     self.compare_failovers_logs(failovers_stats, self.servers, self.buckets, perNode=False)
Example #27
    def verification_phase(test, master):
        # Stop loaders
        SwapRebalanceBase.stop_load(test.loaders)
        test.log.info("DONE DATA ACCESS PHASE")

        test.log.info("VERIFICATION PHASE")
        rest = RestConnection(master)
        servers_in_cluster = []
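        # keep only the servers that are currently members of the cluster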
        nodes = rest.get_nodes()
        for server in test.servers:
            for node in nodes:
                if node.ip == server.ip:
                    servers_in_cluster.append(server)
        RebalanceHelper.wait_for_replication(servers_in_cluster, test.cluster_helper)
        SwapRebalanceBase.items_verification(test, master)
Example #28
    def _common_test_body(self):
        master = self.servers[0]
        rest = RestConnection(master)
        bucket_data = RebalanceBaseTest.bucket_data_init(rest)

        self.log.info("INTIAL LOAD")
        RebalanceBaseTest.load_all_buckets_task(rest, self.task_manager, bucket_data, self.load_ratio,
            keys_count=self.keys_count)

        rebalance_out = False
        for server in self.servers[1:]:
            if rebalance_out:
                # Pick a node to rebalance out, other than master
                ejectedNodes = [RebalanceHelper.pick_node(master)]
            else:
                ejectedNodes = []
            current_nodes = RebalanceHelper.getOtpNodeIds(master)
            self.log.info("current nodes : {0}".format(current_nodes))
            self.log.info("adding node {0}, removing node {1} and rebalance afterwards".format(server.ip,
                [node.ip for node in ejectedNodes]))

            self.log.info("START PARALLEL LOAD")
            RebalanceBaseTest.tasks_for_buckets(rest, self.task_manager, bucket_data,
                DELETE_RATIO=self.delete_ratio,
                ACCESS_RATIO=self.access_ratio, EXPIRY_RATIO=self.expiry_ratio)

            self.log.info("INCREMENTAL REBALANCE IN/OUT")
            # rebalance in/out a server
            RebalanceTaskHelper.add_rebalance_task(self.task_manager,
                [master],
                [server],
                ejectedNodes, do_stop=self.do_stop)
            # wait for loading tasks to finish
            RebalanceBaseTest.finish_all_bucket_tasks(rest, bucket_data)

            # Make sure we have at least 3 nodes, for replica=2
            if len(current_nodes) > 2:
                rebalance_out = True

        if self.do_verify:
            self.log.info("VERIFICATION")
            RebalanceBaseTest.do_kv_and_replica_verification(master,
                self.task_manager,
                bucket_data,
                self.replica,
                self)
        else:
            self.log.info("NO VERIFICATION")
Example #29
    def populate_alternated(self, num_vbuckets, docs):
        """Every vBucket gets a doc first

        Populating the vBuckets alternated means that every vBucket gets
        a document first, before it receives the second one and so on.

        For example if we have 6 documents named doc-1 ... doc-6 and 3
        vBuckets the result will be:

            vbucket-1: doc-1, doc-4
            vbucket-2: doc-2, doc-5
            vbucket-3: doc-3, doc-6
        """
        for i, doc in enumerate(docs):
            self.insert_into_vbucket(i % num_vbuckets, doc)
        RebalanceHelper.wait_for_persistence(self.master, self.bucket, 0)
Example #30
    def rebalance_in_out_at_once_persistence_stopped(self):
        num_nodes_with_stopped_persistence = self.input.param("num_nodes_with_stopped_persistence", 1)
        servs_init = self.servers[:self.nodes_init]
        servs_in = [self.servers[i + self.nodes_init] for i in range(self.nodes_in)]
        servs_out = [self.servers[self.nodes_init - i - 1] for i in range(self.nodes_out)]
        rest = RestConnection(self.master)
        self._wait_for_stats_all_buckets(servs_init)
        for server in servs_init[:min(num_nodes_with_stopped_persistence, self.nodes_init)]:
            shell = RemoteMachineShellConnection(server)
            for bucket in self.buckets:
                shell.execute_cbepctl(bucket, "stop", "", "", "")
        self.sleep(5)
        self.num_items_without_persistence = self.input.param("num_items_without_persistence", 100000)
        gen_extra = BlobGenerator('mike', 'mike-', self.value_size, start=self.num_items / 2,
                                  end=self.num_items / 2 + self.num_items_without_persistence)
        self.log.info("current nodes : {0}".format([node.id for node in rest.node_statuses()]))
        self.log.info("adding nodes {0} to cluster".format(servs_in))
        self.log.info("removing nodes {0} from cluster".format(servs_out))
        tasks = self._async_load_all_buckets(self.master, gen_extra, "create", 0, batch_size=1000)
        result_nodes = set(servs_init + servs_in) - set(servs_out)
        # wait timeout in 60 min because MB-7386 rebalance stuck
        self.cluster.rebalance(servs_init[:self.nodes_init], servs_in, servs_out, timeout=self.wait_timeout * 60)
        for task in tasks:
            task.result()

        self._wait_for_stats_all_buckets(servs_init[:self.nodes_init - self.nodes_out], \
                                         ep_queue_size=self.num_items_without_persistence * 0.9, ep_queue_size_cond='>')
        self._wait_for_stats_all_buckets(servs_in)
        self._verify_all_buckets(self.master, timeout=None)
        self._verify_stats_all_buckets(result_nodes)
        #verify that curr_items_tot corresponds to sum of curr_items from all nodes
        verified = True
        for bucket in self.buckets:
            verified &= RebalanceHelper.wait_till_total_numbers_match(self.master, bucket)
        self.assertTrue(verified, "Lost items!!! Replication was completed but sum(curr_items) don't match the curr_items_total")
Example #31
    def test_rebalance_in_out_with_failover_addback_recovery(self):
        """
        Rebalances nodes out and in with failover and full/delta recovery add back of a node
        Use different nodes_in and nodes_out params to have uneven add and deletion. Use 'zone'
        param to have nodes divided into server groups by having zone > 1.

        This test begins by loading a given number of items into the cluster. It then
        removes one node, rebalances that node out the cluster, and then rebalances it back
        in. During the rebalancing we update all of the items in the cluster. Once the
        node has been removed and added back, we wait for the disk queues to drain, and
        then verify that there has been no data loss, sum(curr_items) match the curr_items_total.
        We then remove and add back two nodes at a time and so on until we have reached the point
        where we are adding back and removing at least half of the nodes.
        """
        recovery_type = self.input.param("recoveryType", "full")
        gen = BlobGenerator('mike',
                            'mike-',
                            self.value_size,
                            end=self.num_items)
        self._load_all_buckets(self.master, gen, "create", 0)
        tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
        servs_in = self.servers[self.nodes_init:self.nodes_init +
                                self.nodes_in]
        servs_out = self.servers[self.nodes_init -
                                 self.nodes_out:self.nodes_init]
        for task in tasks:
            task.result(self.wait_timeout * 20)
        self._verify_stats_all_buckets(self.servers[:self.nodes_init],
                                       timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
        self.sleep(20)
        prev_vbucket_stats = self.get_vbucket_seqnos(
            self.servers[:self.nodes_init], self.buckets)
        prev_failover_stats = self.get_failovers_logs(
            self.servers[:self.nodes_init], self.buckets)
        disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
            self.servers[:self.nodes_init], self.buckets, path=None)
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                             prev_failover_stats)
        self.rest = RestConnection(self.master)
        self.nodes = self.get_nodes(self.master)
        result_nodes = list(
            set(self.servers[:self.nodes_init] + servs_in) - set(servs_out))
        for node in servs_in:
            self.rest.add_node(self.master.rest_username,
                               self.master.rest_password, node.ip, node.port)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        # Mark Node for failover
        self.sleep(30)
        success_failed_over = self.rest.fail_over(chosen[0].id, graceful=False)
        # Mark Node for full recovery
        if success_failed_over:
            self.rest.set_recovery_type(otpNode=chosen[0].id,
                                        recoveryType=recovery_type)
        self.sleep(30)
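        # when recoveryType is delta, this rebalance is expected to fail with
        # 'deltaRecoveryNotPossible' because new nodes are being added at the same time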
        try:
            self.shuffle_nodes_between_zones_and_rebalance(servs_out)
        except Exception as e:
            if "deltaRecoveryNotPossible" not in e.__str__():
                self.fail("Rebalance failed with an unexpected error; it is expected to fail"
                          " with 'deltaRecoveryNotPossible' since delta recovery cannot"
                          " proceed while nodes are being added")
Example #32
    def test_rebalance_in_out_at_once_persistence_stopped(self):
        """
        PERFORMANCE: Rebalance in/out at once with stopped persistence.

        This test begins by loading a given number of items into the cluster
        with self.nodes_init nodes in it. Then we stop persistence on some
        nodes. The test starts to update some data and load new data into the
        cluster. At that time we add servs_in nodes and remove servs_out nodes
        and start rebalance. After rebalance and data ops are completed we
        start the verification phase: wait for the disk queues to drain,
        verify the number of items that were/were not persisted against the
        expected values, verify that there has been no data loss, and check
        that sum(curr_items) matches curr_items_total. Once all checks pass,
        the test is finished.
        Available parameters by default are:
        nodes_init=1, nodes_in=1,
        nodes_out=1, num_nodes_with_stopped_persistence=1
        num_items_without_persistence=100000
        """
        num_nodes_with_stopped_persistence = self.input.param(
            "num_nodes_with_stopped_persistence", 1)
        servs_init = self.servers[:self.nodes_init]
        servs_in = [
            self.servers[i + self.nodes_init] for i in range(self.nodes_in)
        ]
        servs_out = [
            self.servers[self.nodes_init - i - 1]
            for i in range(self.nodes_out)
        ]
        rest = RestConnection(self.master)
        self._wait_for_stats_all_buckets(servs_init)
        for server in servs_init[:min(num_nodes_with_stopped_persistence, self.
                                      nodes_init)]:
            shell = RemoteMachineShellConnection(server)
            for bucket in self.buckets:
                shell.execute_cbepctl(bucket, "stop", "", "", "")
        self.sleep(5)
        self.num_items_without_persistence = self.input.param(
            "num_items_without_persistence", 100000)
        gen_extra = BlobGenerator('mike',
                                  'mike-',
                                  self.value_size,
                                  start=self.num_items // 2,
                                  end=self.num_items // 2 +
                                  self.num_items_without_persistence)
        self.log.info("current nodes : {0}".format(
            [node.id for node in rest.node_statuses()]))
        self.log.info("adding nodes {0} to cluster".format(servs_in))
        self.log.info("removing nodes {0} from cluster".format(servs_out))
        tasks = self._async_load_all_buckets(self.master,
                                             gen_extra,
                                             "create",
                                             0,
                                             batch_size=1000)
        result_nodes = set(servs_init + servs_in) - set(servs_out)
        # wait timeout in 60 min because MB-7386 rebalance stuck
        self.cluster.rebalance(
            servs_init[:self.nodes_init],
            servs_in,
            servs_out,
            timeout=self.wait_timeout * 60,
            sleep_before_rebalance=self.sleep_before_rebalance)
        for task in tasks:
            task.result()

        # Validate seq_no snap_start/stop values after rebalance
        self.check_snap_start_corruption()

        self._wait_for_stats_all_buckets(
            servs_init[:self.nodes_init - self.nodes_out],
            ep_queue_size=self.num_items_without_persistence * 0.9,
            ep_queue_size_cond='>')
        self._wait_for_stats_all_buckets(servs_in)
        self._verify_all_buckets(self.master, timeout=None)
        self._verify_stats_all_buckets(result_nodes)
        # verify that curr_items_tot corresponds to sum of curr_items from all nodes
        verified = True
        for bucket in self.buckets:
            verified &= RebalanceHelper.wait_till_total_numbers_match(
                self.master, bucket)
        self.assertTrue(
            verified,
            "Lost items!!! Replication was completed but sum(curr_items) don't match the curr_items_total"
        )
        self.verify_unacked_bytes_all_buckets()
Example #33
    def _do_warmup(self, howmany, timeout_in_seconds=1800):
        self._insert_data(howmany)
        if int(howmany) < 50:
            self.log.info("sleeping 10 seconds so the small number of items is inserted correctly into the bucket")
            time.sleep(10)
        curr_items = int(self.onenodemc.stats()["curr_items"])
        uptime = int(self.onenodemc.stats()["uptime"])
        RebalanceHelper.wait_for_persistence(self.master, "default")
        self.log.info("sleeping for 10 seconds")
        time.sleep(10)
        rest = RestConnection(self.master)
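        # kill the memcached process via ns_server's diag/eval so the node must warm up from disk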
        command = "try ns_server_testrunner_api:kill_memcached(20000) catch _:_ -> [erlang:exit(element(2, X), kill) || X <- supervisor:which_children(ns_port_sup)] end."
        memcached_restarted, content = rest.diag_eval(command)
        self.assertTrue(memcached_restarted, "unable to restart memcached/moxi process through diag/eval")

        #wait until memcached starts
        start = time.time()
        memcached_restarted = False
        while time.time() - start < 60:
            try:
                self.onenodemc = MemcachedClientHelper.direct_client(self.master, "default")
                value = int(self.onenodemc.stats()["uptime"])
                if value < uptime:
                    self.log.info("memcached restarted...")
                    memcached_restarted = True
                    break
                self.onenodemc.close()
                # the uptime stat has a 1 sec resolution, so there is no point
                # in retrying more often
                time.sleep(1)
            except Exception:
                time.sleep(1)

        self.assertTrue(memcached_restarted, "memcached did not restart (uptime was not reset) within 60 seconds")

        # Warmup till curr_items match
        self.onenodemc = MemcachedClientHelper.direct_client(self.master, "default")
        stats = self.onenodemc.stats()
        present_count = int(stats["curr_items"])
        ep_warmup_thread = stats["ep_warmup_thread"]
        self.log.info("ep curr_items : {0}, inserted_items {1} directly after kill_memcached ".format(present_count, curr_items))
        self.log.info("ep_warmup_thread directly after kill_memcached: {0}".format(ep_warmup_thread))
        start = time.time()
        while ep_warmup_thread != "complete":
            if (time.time() - start) <= timeout_in_seconds:
                stats = self.onenodemc.stats()
                present_count = int(stats["curr_items"])
                ep_warmup_thread = stats["ep_warmup_thread"]
                self.log.warn("curr_items {0}, ep_warmup_thread {1}".format(present_count, ep_warmup_thread))
                time.sleep(1)
            else:
                self.fail("Timed out waiting for warmup")

        stats = self.onenodemc.stats()
        present_count = int(stats["curr_items"])
        if present_count < curr_items:
            self.log.error("Warmup failed. Got {0} and expected {1} items".format(present_count, curr_items))
            self.fail("Warmup failed. Incomplete number of messages after killing memcached")

        if "ep_warmup_time" not in stats:
            self.log.error("'ep_warmup_time' was not found in stats:{0}".format(stats))
        warmup_time = int(stats["ep_warmup_time"])
        self.log.info("ep_warmup_time is {0}".format(warmup_time))
Example #34
    def test_rebalance_inout_with_durability_failure(self):
        """
        Perform irregular number of in_out nodes
        1. Swap-out 'self.nodes_out' nodes
        2. Add nodes using 'self.nodes_in' such that,
           replica_number > nodes_in_cluster
        3. Perform swap-rebalance
        4. Make sure durability is not broken due to swap-rebalance
        5. Add back a node and do CRUD on the bucket
        6. Verify durability works after node addition

        Note: This is a Negative case. i.e: Durability will be broken
        """
        master = self.cluster.master
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        def_bucket = self.bucket_util.buckets[0]

        # TODO: Enable verification
        """
        vbucket_info_dict = dict()

        # Cb stat object for verification purpose
        master_shell_conn = RemoteMachineShellConnection(master)
        master_node_cb_stat = Cbstats(master_shell_conn)

        # Update each vbucket's seq_no for latest value for verification
        for vb_num in range(0, self.vbuckets):
            vbucket_info_dict[vb_num] = master_node_cb_stat.vbucket_seqno(
                def_bucket.name, vb_num, "abs_high_seqno")
        """

        # Rest connection to add/rebalance/monitor nodes
        rest = RestConnection(master)

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.nodes_out)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            if self.nodes_out == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.nodes_in]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            self.loaders = self.start_access_phase()

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        if self.do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            retry = 0
            for expected_progress in (20, 40, 60):
                self.log.info(
                    "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                    format(expected_progress))
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error(
                            "rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance has already reached 100%")
                        break
                    elif progress >= expected_progress:
                        self.log.info(
                            "Rebalance will be stopped with {0}%".format(
                                progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped,
                                        msg="unable to stop rebalance")
                        self.sleep(20)
                        rest.rebalance(otpNodes=[
                            node.id for node in rest.node_statuses()
                        ],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        self.sleep(1)
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))
        # TODO: There will be failure in doc_count verification due to
        # swap_rebalance. Need to update verification steps accordingly to
        # satisfy this
        self.verification_phase()

        # Add back first ejected node back into the cluster
        self.task.rebalance(self.cluster.nodes_in_cluster,
                            [toBeEjectedNodes[0]], [])

        # Load doc into all vbuckets to verify durability
        gen_create = doc_generator('test_', 0, self.num_items)
        task = self.task.async_load_gen_docs_atomicity(
            self.cluster,
            def_bucket,
            gen_create,
            self.op_type,
            exp=0,
            batch_size=10,
            process_concurrency=8,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            timeout_secs=self.sdk_timeout,
            retries=self.sdk_retries,
            transaction_timeout=self.transaction_timeout,
            commit=self.transaction_commit)
        self.task_manager.get_task_result(task)
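Example #34 decides which nodes to eject during the swap: when the orchestrator itself is being swapped out, its otp id either replaces the first picked node or is appended when every current node is going out. A hedged restatement of that selection logic with plain string ids (the ids below are made up for illustration):

def build_eject_list(picked_node_ids, current_node_ids, orchestrator_id,
                     swap_orchestrator, nodes_out):
    # Mirror of the selection branch above: replace the first pick with the
    # orchestrator, or append it when the whole cluster is being swapped out.
    eject_ids = list(picked_node_ids)
    if swap_orchestrator:
        if nodes_out == len(current_node_ids):
            eject_ids.append(orchestrator_id)
        else:
            eject_ids[0] = orchestrator_id
    return eject_ids

print(build_eject_list(["ns_1@10.0.0.2"],
                       ["ns_1@10.0.0.1", "ns_1@10.0.0.2"],
                       "ns_1@10.0.0.1", swap_orchestrator=True, nodes_out=1))
# -> ['ns_1@10.0.0.1']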
Example #35
0
 def end_rebalance(master):
     RebalanceHelper.end_rebalance(master)
Example #36
0
    def run_test(self):
        ep_threshold = self.input.param("ep_threshold", "ep_mem_low_wat")
        active_resident_threshold = int(
            self.input.param("active_resident_threshold", 10))

        mc = MemcachedClientHelper.direct_client(self.servers[0],
                                                 self.bucket_name)
        stats = mc.stats()
        threshold = int(self.input.param('threshold', stats[ep_threshold]))
        threshold_reached = False
        self.num_items = self.input.param("items", 10000)
        self._load_doc_data_all_buckets('create')

        # load items till reached threshold or mem-ratio is less than resident ratio threshold
        while not threshold_reached:
            mem_used = int(mc.stats()["mem_used"])
            if mem_used < threshold or int(mc.stats(
            )["vb_active_perc_mem_resident"]) >= active_resident_threshold:
                self.log.info(
                    "mem_used and vb_active_perc_mem_resident_ratio reached at %s/%s and %s "
                    % (mem_used, threshold,
                       mc.stats()["vb_active_perc_mem_resident"]))
                items = self.num_items
                self.num_items += self.input.param("items", 10000)
                self._load_doc_data_all_buckets('create', items)
            else:
                threshold_reached = True
                self.log.info("DGM state achieved!!!!")

        # wait for draining of data before restart and warm up
        for bucket in self.buckets:
            RebalanceHelper.wait_for_persistence(self.nodes_server[0],
                                                 bucket,
                                                 bucket_type=self.bucket_type)

        while True:

            #            read_data_task = self.cluster.async_verify_data(self.master, self.buckets[0], self.buckets[0].kvs[1])

            read_data_task = Thread(target=self._run_get)
            read_data_task.start()
            # Spawn threads to run 'stats all' and 'stats reset' asynchronously
            start = time.time()
            while (time.time() - start) < 300:

                stats_all_thread = []
                stats_reset_thread = []

                for i in range(self.threads_to_run):
                    stat_str = ''
                    stats_all_thread.append(
                        Thread(target=self._get_stats, args=[stat_str]))
                    stats_all_thread[i].start()
                    stat_str = 'reset'
                    stats_reset_thread.append(
                        Thread(target=self._get_stats, args=[stat_str]))
                    stats_reset_thread[i].start()

                for i in range(self.threads_to_run):
                    stats_all_thread[i].join()
                    stats_reset_thread[i].join()

                del stats_all_thread
                del stats_reset_thread

#            read_data_task.result()
            read_data_task.join()
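The loop at the top of Example #36 keeps loading batches until the bucket reaches a DGM (data-greater-than-memory) state. A framework-free sketch of that condition, with get_stats and load_batch as stand-ins for mc.stats() and _load_doc_data_all_buckets():

def load_until_dgm(get_stats, load_batch, threshold, active_resident_threshold):
    # Keep loading while memory use is under the watermark or the active
    # resident ratio is still at/above the target percentage.
    while True:
        stats = get_stats()
        mem_used = int(stats["mem_used"])
        resident = int(stats["vb_active_perc_mem_resident"])
        if mem_used < threshold or resident >= active_resident_threshold:
            load_batch()
        else:
            return mem_used, resident  # DGM state achieved

# Toy usage: each batch adds 1000 bytes and drops the resident ratio by 5 points
state = {"mem_used": 0, "vb_active_perc_mem_resident": 100}

def fake_stats():
    return state

def fake_load():
    state["mem_used"] += 1000
    state["vb_active_perc_mem_resident"] -= 5

print(load_until_dgm(fake_stats, fake_load,
                     threshold=4000, active_resident_threshold=10))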
Example #37
0
    def _failover_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings
        num_initial_servers = self.num_initial_servers
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        self.create_buckets()

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, _ = RebalanceHelper.rebalance_in(initial_servers,
                                                 len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = self.start_load_phase()

        # Wait till load phase is over
        self.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        self.log.info("current nodes : {0}".format(
            RebalanceHelper.getOtpNodeIds(master)))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(
            master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.fail_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            optNodesIds[0] = content

        self.log.info("FAILOVER PHASE")
        # Failover selected nodes
        for node in optNodesIds:
            self.log.info(
                "failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.failover_factor]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.fail_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                new_swap_servers))

        self.verification_phase()
Example #38
0
 def rebalance_in(servers, how_many, monitor=True):
     return RebalanceHelper.rebalance_in(servers, how_many, monitor)
Example #39
0
    def rebalance_out_with_failover(self):

        self.transaction_timeout = self.input.param("transaction_timeout", 100)
        self.transaction_commit = self.input.param("transaction_commit", True)

        task = self.task.async_load_gen_docs_atomicity(
            self.cluster,
            self.bucket_util.buckets,
            self.gen_load,
            "create",
            0,
            batch_size=20,
            process_concurrency=8,
            replicate_to=self.replicate_to,
            persist_to=self.persist_to,
            timeout_secs=self.sdk_timeout,
            retries=self.sdk_retries,
            transaction_timeout=self.transaction_timeout,
            commit=self.transaction_commit,
            durability=self.durability_level)
        self.task.jython_task_manager.get_task_result(task)
        self.sleep(60, "Task completed")
        fail_over = self.input.param("fail_over", False)
        self.rest = RestConnection(self.cluster.master)
        std = self.std_vbucket_dist or 1.0

        gen_delete = self.get_doc_generator(self.num_items / 2, self.num_items)
        gen_create = self.get_doc_generator(self.num_items + 1,
                                            self.num_items * 3 / 2)
        # define which doc's ops will be performed during rebalancing
        # allows multiple of them but one by one
        tasks = []
        if (self.doc_ops is not None):
            if ("update" in self.doc_ops):
                tasks.append(
                    self.task.async_load_gen_docs_atomicity(
                        self.cluster,
                        self.bucket_util.buckets,
                        self.gen_update,
                        "rebalance_update",
                        0,
                        batch_size=20,
                        process_concurrency=8,
                        replicate_to=self.replicate_to,
                        persist_to=self.persist_to,
                        timeout_secs=self.sdk_timeout,
                        retries=self.sdk_retries,
                        transaction_timeout=self.transaction_timeout,
                        commit=self.transaction_commit,
                        durability=self.durability_level))

            if ("create" in self.doc_ops):
                tasks.append(
                    self.task.async_load_gen_docs_atomicity(
                        self.cluster,
                        self.bucket_util.buckets,
                        gen_create,
                        "create",
                        0,
                        batch_size=20,
                        process_concurrency=8,
                        replicate_to=self.replicate_to,
                        persist_to=self.persist_to,
                        timeout_secs=self.sdk_timeout,
                        retries=self.sdk_retries,
                        transaction_timeout=self.transaction_timeout,
                        commit=self.transaction_commit,
                        durability=self.durability_level))
                self.num_items = self.num_items + 1 + (self.num_items * 3 / 2)

            if ("delete" in self.doc_ops):
                tasks.append(
                    self.task.async_load_gen_docs_atomicity(
                        self.cluster,
                        self.bucket_util.buckets,
                        gen_delete,
                        "rebalance_delete",
                        0,
                        batch_size=20,
                        process_concurrency=8,
                        replicate_to=self.replicate_to,
                        persist_to=self.persist_to,
                        timeout_secs=self.sdk_timeout,
                        retries=self.sdk_retries,
                        transaction_timeout=self.transaction_timeout,
                        commit=self.transaction_commit,
                        durability=self.durability_level))

                self.num_items = self.num_items - (self.num_items / 2)
            for task in tasks:
                self.task.jython_task_manager.get_task_result(task)

        ejectedNode = self.cluster_util.find_node_info(
            self.cluster.master, self.cluster.servers[self.nodes_init - 1])

        self.sleep(100, "Sleep for 100 seconds")

        prev_failover_stats = self.bucket_util.get_failovers_logs(
            self.cluster.servers[:self.nodes_init], self.bucket_util.buckets)
        prev_vbucket_stats = self.bucket_util.get_vbucket_seqnos(
            self.cluster.servers[:self.nodes_init], self.bucket_util.buckets)
        record_data_set = self.bucket_util.get_data_set_all(
            self.cluster.servers[:self.nodes_init], self.bucket_util.buckets)
        self.bucket_util.compare_vbucketseq_failoverlogs(
            prev_vbucket_stats, prev_failover_stats)
        self.rest = RestConnection(self.cluster.master)

        chosen = RebalanceHelper.pick_nodes(self.cluster.master, howmany=1)
        new_server_list = self.cluster_util.add_remove_servers(
            self.cluster.servers, self.cluster.servers[:self.nodes_init],
            [self.cluster.servers[self.nodes_init - 1], chosen[0]], [])
        # Mark Node for failover
        success_failed_over = self.rest.fail_over(chosen[0].id,
                                                  graceful=fail_over)
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[chosen[0].id, ejectedNode.id])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                        msg="Rebalance failed")
        self.cluster.nodes_in_cluster = new_server_list

        self.sleep(60, "Starting data_analaysis_all")
        self.bucket_util.data_analysis_all(record_data_set, new_server_list,
                                           self.bucket_util.buckets)

        self.sleep(60, "Vb_Distribution_Analysis starts")
        nodes = self.cluster_util.get_nodes_in_cluster(self.cluster.master)
        self.bucket_util.vb_distribution_analysis(
            servers=nodes,
            buckets=self.bucket_util.buckets,
            num_replicas=self.num_replicas,
            std=1.0,
            total_vbuckets=self.vbuckets)
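In Example #39 the create generator covers (num_items + 1 .. num_items * 3 / 2) and the delete generator covers (num_items / 2 .. num_items). A back-of-the-envelope count for those ranges, assuming Python 2 style integer division as in the original; the test's own num_items bookkeeping may differ from this:

num_items = 10000                                # illustrative starting count
created = num_items * 3 // 2 - (num_items + 1)   # docs added by gen_create
deleted = num_items - num_items // 2             # docs removed by gen_delete
expected = num_items + created - deleted
print(created, deleted, expected)                # 4999 5000 9999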
Example #40
0
    def test_rebalance_out(self):
        RebalanceBaseTest.common_setup(self._input, self, replica=1)
        log = logger.Logger().get_logger()
        master = self._servers[0]
        num_of_docs = TestInputSingleton.input.param("num_of_docs", 100000)
        replica = TestInputSingleton.input.param("replica", 1)
        add_items_count = TestInputSingleton.input.param(
            "num_of_creates", 30000)
        size = TestInputSingleton.input.param("item_size", 256)
        params = {
            "sizes": [size],
            "count": num_of_docs,
            "seed": str(uuid.uuid4())[:7]
        }
        rest = RestConnection(master)
        buckets = rest.get_buckets()
        bucket_data = {}
        generators = {}
        for bucket in buckets:
            bucket_data[bucket.name] = {"kv_store": ClientKeyValueStore()}

        rebalanced_in, which_servers = RebalanceBaseTest.rebalance_in(
            self._servers,
            len(self.servers) - 1)
        self.assertTrue(rebalanced_in,
                        msg="unable to add and rebalance more nodes")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=[])
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding nodes {0}".format(
                [node.id for node in rest.node_statuses()]))
        while len(rest.node_statuses()) > 1:
            #pick a node that is not the master node
            toBeEjectedNode = RebalanceHelper.pick_node(master)
            rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                           ejectedNodes=[toBeEjectedNode.id])
            self.assertTrue(
                rest.monitorRebalance(),
                msg="rebalance operation failed after adding node {0}".format(
                    toBeEjectedNode.id))
            for bucket in buckets:
                kv_store = bucket_data[bucket.name]["kv_store"]
                add_items_seed = str(uuid.uuid4())[:7]
                self._add_items(add_items_seed, bucket, add_items_count,
                                kv_store)
                errors = RebalanceDataGenerator.do_verification(
                    kv_store, rest, bucket.name)
                if errors:
                    log.error("verification returned {0} errors".format(
                        len(errors)))
                load_set_ops = {"ops": "set", "bucket": bucket.name}
                load_set_ops.update(params)
                load_delete_ops = {
                    "ops": "delete",
                    "bucket": bucket.name,
                    "sizes": [size],
                    "count": add_items_count / 5,
                    "seed": add_items_seed
                }
                thread = RebalanceDataGenerator.start_load(
                    rest, bucket.name,
                    RebalanceDataGenerator.create_loading_tasks(load_set_ops),
                    kv_store)
                generators["set"] = {"thread": thread}
                #restart three times
                generators["set"]["thread"].start()
                thread = RebalanceDataGenerator.start_load(
                    rest, bucket.name,
                    RebalanceDataGenerator.create_loading_tasks(
                        load_delete_ops), kv_store)
                generators["delete"] = {"thread": thread}
                generators["delete"]["thread"].start()
            self.log.info("current nodes : {0}".format(
                [node.id for node in rest.node_statuses()]))

            for bucket in buckets:
                kv_store = bucket_data[bucket.name]["kv_store"]
                errors = RebalanceDataGenerator.do_verification(
                    kv_store, rest, bucket.name)
                if errors:
                    log.error("verification returned {0} errors".format(
                        len(errors)))
            generators["set"]["thread"].join()
            generators["delete"]["thread"].join()
            for bucket in buckets:
                kv_store = bucket_data[bucket.name]["kv_store"]
                bucket_data[bucket.name]["items_inserted_count"] = len(
                    kv_store.valid_items())
                RebalanceBaseTest.replication_verification(
                    master, bucket_data, replica, self)
Example #41
0
    def common_test_body(self, failover_reason):
        """
            Main Test body which contains the flow of the failover basic steps
            1. Starts Operations if programmed into the test case(before/after)
            2. Start View and Index Building operations
            3. Failover K out of N nodes (failover can be HARD/GRACEFUL)
            4.1 Rebalance the cluster after failover of K nodes
            4.2 Run Add-Back operation with recoveryType = (full/delta) with rebalance
            5. Verify all expected operations completed by checking stats,
               replication, views, data correctness
        """
        # Pick the reference node for communication
        # We pick a node in the cluster which will NOT be failed over
        self.filter_list = []
        if self.failoverMaster:
            self.master = self.servers[1]
        self.log.info(
            "Picking node {0} as reference node for test case".format(
                self.master.ip))
        self.print_test_params(failover_reason)
        self.rest = RestConnection(self.master)
        self.nodes = self.rest.node_statuses()
        # Set the data path for the cluster
        self.data_path = self.rest.get_data_path()

        # Check if the test case has to be run for 3.0.0
        versions = self.rest.get_nodes_versions()
        self.version_greater_than_2_5 = True
        for version in versions:
            if "3" > version:
                self.version_greater_than_2_5 = False

        # Do not run this test if the graceful category is being used
        if not self.version_greater_than_2_5 and (self.graceful or
                                                  self.recoveryType is not None):
            self.log.error(
                "Graceful failover can't be applied to nodes with version lower than 3.*"
            )
            self.log.error(
                "Please check configuration parameters: SKIPPING TEST.")
            return

        # Find nodes that will under go failover
        if self.failoverMaster:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=1, target_node=self.servers[0])
        else:
            self.chosen = RebalanceHelper.pick_nodes(
                self.master, howmany=self.num_failed_nodes)

        # Perform operations - Create/Update/Delete
        # self.withMutationOps = True => Run Operations in parallel to failover
        # self.withMutationOps = False => Run Operations Before failover
        self.load_initial_data()
        if not self.withMutationOps:
            self.run_mutation_operations()
        # Perform view creation tasks and wait for completion before failover
        if self.withViewsOps:
            self.run_view_creation_operations(self.servers)
            if not self.createIndexesDuringFailover:
                self.query_and_monitor_view_tasks(self.servers)

        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()

        # Take a snapshot of the data set used for validation
        record_static_data_set = dict()
        prev_vbucket_stats = dict()
        prev_failover_stats = dict()
        if not self.withMutationOps:
            record_static_data_set = self.get_data_set_all(self.servers,
                                                           self.buckets,
                                                           path=None)

        # Capture  vbucket and failover stats if test version >= 2.5.*
        if self.version_greater_than_2_5 and self.upr_check:
            prev_vbucket_stats = self.get_vbucket_seqnos(
                self.servers, self.buckets)
            prev_failover_stats = self.get_failovers_logs(
                self.servers, self.buckets)

        # Perform Operations related to failover
        if self.withMutationOps or self.withViewsOps or self.compact:
            self.run_failover_operations_with_ops(self.chosen, failover_reason)
        else:
            self.run_failover_operations(self.chosen, failover_reason)

        # TODO: Enable this even when 'flusher_batch_split_trigger' is not set
        if self.flusher_batch_split_trigger and \
                self.num_replicas >= self.num_failed_nodes:
            tasks = self._async_load_all_buckets(self.master, self.gen_update,
                                                 "update", 0)
            for task in tasks:
                task.result()

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        # Add back + rebalance / only rebalance with verification
        if not self.gracefulFailoverFail and self.runRebalanceAfterFailover:
            if self.add_back_flag:
                self.run_add_back_operation_and_verify(self.chosen,
                                                       prev_vbucket_stats,
                                                       record_static_data_set,
                                                       prev_failover_stats)
            else:
                self.run_rebalance_after_failover_and_verify(
                    self.chosen, prev_vbucket_stats, record_static_data_set,
                    prev_failover_stats)

        if self.graceful:
            # Validate seq_no snap_start/stop values
            self.check_snap_start_corruption()

        if self.during_ops is None:
            self.verify_unacked_bytes_all_buckets(filter_list=self.filter_list,
                                                  master_node=self.master)
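Example #41 gates graceful failover on the node versions with a lexical string comparison ('"3" > version'). A hedged alternative sketch that compares numeric (major, minor) tuples instead; it is illustrative only and not how the test framework does it:

def version_at_least(version_str, major, minor=0):
    # "3.0.0-1209-rel" -> (3, 0); compare numerically rather than lexically
    parts = version_str.split("-")[0].split(".")
    nums = tuple(int(p) for p in parts[:2] if p.isdigit())
    return nums >= (major, minor)

print(version_at_least("2.5.1-1083-rel", 3))   # False
print(version_at_least("3.0.0-1209-rel", 3))   # True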
Example #42
0
    def _test_backup_add_restore_bucket_body(self,
                                             bucket,
                                             delay_after_data_load,
                                             startup_flag,
                                             single_node):
        server = self.master
        rest = RestConnection(server)
        info = rest.get_nodes_self()
        size = int(info.memoryQuota * 2.0 / 3.0)
        if bucket == "default":
            rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=info.moxi)
        else:
            proxyPort = info.moxi + 500
            rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=proxyPort,
                               authType="sasl", saslPassword="******")

        ready = BucketOperationHelper.wait_for_memcached(server, bucket)
        self.assertTrue(ready, "wait_for_memcached failed")
        if not single_node:
            self.add_nodes_and_rebalance()
        distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05}
        inserted_keys, rejected_keys = MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.master],
                                                                                             name=bucket,
                                                                                             ram_load_ratio=1,
                                                                                             value_size_distribution=distribution,
                                                                                             moxi=True,
                                                                                             write_only=True,
                                                                                             number_of_threads=2)

        if not single_node:
            rest = RestConnection(self.master)
            self.assertTrue(RestHelper(rest).wait_for_replication(180), msg="replication did not complete")

        self.log.info("Sleep {0} seconds after data load".format(delay_after_data_load))
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        node = RestConnection(self.master).get_nodes_self()
        if not startup_flag:
            for server in self.servers:
                shell = RemoteMachineShellConnection(server)
                shell.stop_membase()
                shell.stop_couchbase()
                shell.disconnect()

        output, error = self.shell.execute_command(self.perm_command)
        self.shell.log_command_output(output, error)

        #now let's back up
        BackupHelper(self.master, self).backup(bucket, node, self.remote_tmp_folder)

        if not startup_flag:
            for server in self.servers:
                shell = RemoteMachineShellConnection(server)
                shell.start_membase()
                shell.start_couchbase()
                RestHelper(RestConnection(server)).is_ns_server_running()
                shell.disconnect()

        BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self)

        if bucket == "default":
            rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=info.moxi)
        else:
            proxyPort = info.moxi + 500
            rest.create_bucket(bucket, ramQuotaMB=size, proxyPort=proxyPort,
                               authType="sasl", saslPassword="******")
        BucketOperationHelper.wait_for_memcached(self.master, bucket)

        if bucket == "default":
            BackupHelper(self.master, self).restore(backup_location=self.remote_tmp_folder, moxi_port=info.moxi)
        else:
            BackupHelper(self.master, self).restore(backup_location=self.remote_tmp_folder, moxi_port=info.moxi, username=bucket, password='******')

        keys_exist = BucketOperationHelper.keys_exist_or_assert_in_parallel(inserted_keys, self.master, bucket, self, concurrency=4)
        self.assertTrue(keys_exist, msg="unable to verify keys after restore")
Example #43
0
    def test_start_stop_rebalance_after_failover(self):
        """
            Rebalances nodes out and in with failover
            Use different nodes_in and nodes_out params to have uneven add and
            deletion. Use 'zone' param to have nodes divided into server groups
            by having zone > 1.

            The test begins by loading the bucket with the given number of items.
            It then fails over a node. We then rebalance the cluster,
            while adding or removing given number of nodes.
            Once the rebalance reaches 50%, we stop the rebalance and validate
            the cluster stats. We then restart the rebalance and
            validate rebalance was completed successfully.
            """
        fail_over = self.input.param("fail_over", False)
        gen = BlobGenerator('mike',
                            'mike-',
                            self.value_size,
                            end=self.num_items)
        self._load_all_buckets(self.master, gen, "create", 0)
        tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
        for task in tasks:
            task.result(self.wait_timeout * 20)
        self._verify_stats_all_buckets(self.servers[:self.nodes_init],
                                       timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
        self.sleep(20)

        # Validate seq_no snap_start/stop values before rebalance
        self.check_snap_start_corruption()

        prev_vbucket_stats = self.get_vbucket_seqnos(
            self.servers[:self.nodes_init], self.buckets)
        prev_failover_stats = self.get_failovers_logs(
            self.servers[:self.nodes_init], self.buckets)
        _, _ = self.get_and_compare_active_replica_data_set_all(
            self.servers[:self.nodes_init], self.buckets, path=None)
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                             prev_failover_stats)
        self.rest = RestConnection(self.master)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        result_nodes = list(
            set(self.servers[:self.nodes_init] + self.servs_in) -
            set(self.servs_out))
        for node in self.servs_in:
            self.rest.add_node(self.master.rest_username,
                               self.master.rest_password, node.ip, node.port)
        # Mark Node for failover
        self.rest.fail_over(chosen[0].id, graceful=fail_over)

        # Doc_mutation after failing over the nodes
        tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
        for task in tasks:
            task.result(self.wait_timeout * 20)

        # Validate seq_no snap_start/stop values after failover
        self.check_snap_start_corruption()

        self.cluster.async_rebalance(self.servers[:self.nodes_init],
                                     self.servs_in, self.servs_out)
        expected_progress = 50
        rest = RestConnection(self.master)
        reached = RestHelper(rest).rebalance_reached(expected_progress)
        self.assertTrue(
            reached,
            "Rebalance failed or did not reach {0}%".format(expected_progress))
        if not RestHelper(rest).is_cluster_rebalanced():
            self.log.info("Stop the rebalance")
            stopped = rest.stop_rebalance(wait_timeout=self.wait_timeout / 3)
            self.assertTrue(stopped, msg="Unable to stop rebalance")
            self._verify_all_buckets(self.master,
                                     timeout=None,
                                     max_verify=self.max_verify,
                                     batch_size=1)
        self.shuffle_nodes_between_zones_and_rebalance()
        self.verify_cluster_stats(result_nodes,
                                  check_ep_items_remaining=True,
                                  check_bucket_stats=False)
        self.sleep(30)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes,
                                      std=1.0,
                                      total_vbuckets=self.total_vbuckets)

        # Validate seq_no snap_start/stop values after rebalance
        self.check_snap_start_corruption()
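The docstring above describes stopping the rebalance once it reaches 50% and then restarting it. A minimal sketch of that "stop at N%" step, with rest_progress and rest_stop as stand-in callables for the REST helpers (they are assumptions, not the framework's API):

import time

def stop_rebalance_at(rest_progress, rest_stop, expected_progress,
                      max_polls=100, poll_interval=1):
    for _ in range(max_polls):
        progress = rest_progress()
        if progress < 0 or progress == 100:
            return False                # errored out, or already finished
        if progress >= expected_progress:
            return rest_stop()          # reached the target percentage
        time.sleep(poll_interval)
    return False

# Toy usage with a fake progress counter that advances 10% per poll
ticks = iter(range(0, 101, 10))
print(stop_rebalance_at(lambda: next(ticks), lambda: True, 50, poll_interval=0))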
Example #44
0
 def verify_data(master, inserted_keys, bucket, test):
     log = logger.Logger.get_logger()
     log.info("Verifying data")
     ready = RebalanceHelper.wait_for_persistence(master, bucket)
     BucketOperationHelper.keys_exist_or_assert_in_parallel(keys=inserted_keys, server=master, bucket_name=bucket,
         test=test, concurrency=4)
Example #45
0
    def _add_back_failed_node(self, do_node_cleanup=False):
        master = self.servers[0]
        rest = RestConnection(master)
        creds = self.input.membase_settings

        self.log.info("CREATE BUCKET PHASE")
        self.create_buckets()

        # Cluster all servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, _ = RebalanceHelper.rebalance_in(self.servers,
                                                 len(self.servers) - 1)
        self.assertTrue(status, msg="Rebalance was failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = self.start_load_phase()

        # Wait till load phase is over
        self.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(
            master, howmany=self.failover_factor)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        # List of servers that will not be failed over
        not_failed_over = []
        for server in self.servers:
            if self.cluster_run:
                if server.port not in [node.port for node in toBeEjectedNodes]:
                    not_failed_over.append(server)
                    self.log.info("Node {0}:{1} not failed over".format(
                        server.ip, server.port))
            else:
                if server.ip not in [node.ip for node in toBeEjectedNodes]:
                    not_failed_over.append(server)
                    self.log.info("Node {0}:{1} not failed over".format(
                        server.ip, server.port))

        if self.fail_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content
            master = not_failed_over[-1]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

        # Failover selected nodes
        for node in optNodesIds:
            self.log.info(
                "failover node {0} and rebalance afterwards".format(node))
            rest.fail_over(node)

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))

        # Add back the same failed over nodes

        # Cleanup the node, somehow
        # TODO: cluster_run?
        if do_node_cleanup:
            pass

        # Make rest connection with node part of cluster
        rest = RestConnection(master)

        # Given the optNode, find ip
        add_back_servers = []
        nodes = rest.get_nodes()
        for server in nodes:
            if isinstance(server.ip, unicode):
                add_back_servers.append(server)
        final_add_back_servers = []
        for server in self.servers:
            if self.cluster_run:
                if server.port not in [serv.port for serv in add_back_servers]:
                    final_add_back_servers.append(server)
            else:
                if server.ip not in [serv.ip for serv in add_back_servers]:
                    final_add_back_servers.append(server)
        for server in final_add_back_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=[])

        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                add_back_servers))

        self.verification_phase()
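Example #45 rebuilds the add-back list by taking every configured server that is not already reported by the cluster, matching on ip (or on port when running cluster_run). A hedged restatement with plain dicts standing in for server objects:

def servers_to_add_back(all_servers, cluster_servers, by_port=False):
    key = (lambda s: s["port"]) if by_port else (lambda s: s["ip"])
    in_cluster = set(key(s) for s in cluster_servers)
    return [s for s in all_servers if key(s) not in in_cluster]

all_servers = [{"ip": "10.0.0.1", "port": 8091}, {"ip": "10.0.0.2", "port": 8091}]
in_cluster = [{"ip": "10.0.0.1", "port": 8091}]
print(servers_to_add_back(all_servers, in_cluster))
# -> [{'ip': '10.0.0.2', 'port': 8091}]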
Example #46
0
 def rebalance_in(servers, how_many):
     return RebalanceHelper.rebalance_in(servers, how_many)
Example #47
0
    def test_rebalance_inout_with_durability_check(self):
        """
        Perform rebalance with an irregular number of in/out nodes
        1. Swap-out 'self.nodes_out' nodes
        2. Add 'self.nodes_in' nodes into the cluster
        3. Perform swap-rebalance
        4. Make sure durability is not broken due to swap-rebalance

        Note: This is a positive case, i.e. durability should not be broken
        """
        master = self.cluster.master
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        def_bucket = self.bucket_util.buckets[0]

        # Update replica value before performing rebalance in/out
        if self.replica_to_update:
            bucket_helper = BucketHelper(self.cluster.master)

            # Recalculate replicate_to/persist_to as per new replica value
            if self.durability_level is None:
                self.replicate_to = floor(self.replica_to_update / 2) + 1
                self.persist_to = floor(self.replica_to_update / 2) + 2

            # Update bucket replica to new value as given in conf file
            self.log.info("Updating replica count of bucket to {0}".format(
                self.replica_to_update))
            bucket_helper.change_bucket_props(
                def_bucket.name, replicaNumber=self.replica_to_update)

        # Rest connection to add/rebalance/monitor nodes
        rest = RestConnection(master)

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.nodes_out)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            if self.nodes_out == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.nodes_in]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            self.loaders = self.start_access_phase()

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        if self.do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            retry = 0
            for expected_progress in (20, 40, 60):
                self.log.info(
                    "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                    format(expected_progress))
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error(
                            "rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance has already reached 100%")
                        break
                    elif progress >= expected_progress:
                        self.log.info(
                            "Rebalance will be stopped with {0}%".format(
                                progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped,
                                        msg="unable to stop rebalance")
                        self.sleep(20)
                        rest.rebalance(otpNodes=[
                            node.id for node in rest.node_statuses()
                        ],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        self.sleep(1)
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))
        self.verification_phase()
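When no durability_level is set, Example #47 derives replicate_to/persist_to from the updated replica count as floor(replica / 2) + 1 and floor(replica / 2) + 2. A standalone restatement of that arithmetic, shown only as a sketch:

from math import floor

def durability_targets(replica_count):
    replicate_to = int(floor(replica_count / 2.0)) + 1
    persist_to = int(floor(replica_count / 2.0)) + 2
    return replicate_to, persist_to

for replicas in (1, 2, 3):
    print(replicas, durability_targets(replicas))
# 1 -> (1, 2), 2 -> (2, 3), 3 -> (2, 3)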
Example #48
0
    def _common_test_body(self):
        master = self.servers[0]
        rest = RestConnection(master)

        # start load, max_ops_per_second is the combined limit for all buckets
        buckets = rest.get_buckets()
        loaders = []
        self.log.info("max-ops-per-second per bucket: {0}".format(self.max_ops_per_second / len(buckets)))
        for bucket in buckets:
            loader = {}
            loader["mcsoda"] = LoadWithMcsoda(master, self.keys_count, prefix='', bucket=bucket.name,
                password=bucket.saslPassword, protocol='membase-binary')
            loader["mcsoda"].cfg["max-ops"] = 0
            loader["mcsoda"].cfg["max-ops-per-sec"] = self.max_ops_per_second / len(buckets)
            loader["mcsoda"].cfg["exit-after-creates"] = 0
            loader["mcsoda"].cfg["min-value-size"] = self.min_item_size
            loader["mcsoda"].cfg["json"] = 0
            loader["mcsoda"].cfg["batch"] = 100
            loader["thread"] = Thread(target=loader["mcsoda"].load_data, name='mcloader_' + bucket.name)
            loader["thread"].daemon = True
            loaders.append(loader)

        for loader in loaders:
            loader["thread"].start()

        for iteration in range(self.repeat):
            for server in self.servers[1:]:
                self.log.info("iteration {0}: ".format(iteration))
                self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
                self.log.info("adding node {0} and rebalance afterwards".format(server.ip))

                rebalance_done = False
                rebalance_try = 0
                while not rebalance_done:
                    try:
                        ClusterOperationHelper.begin_rebalance_in(master, [server])
                        ClusterOperationHelper.end_rebalance(master)
                        rebalance_done = True
                    except AssertionError as e:
                        rebalance_try += 1
                        self.log.error(e)
                        time.sleep(5)
                        if rebalance_try > 5:
                            raise e

            for server in self.servers[1:]:
                self.log.info("current nodes : {0}".format(RebalanceHelper.getOtpNodeIds(master)))
                self.log.info("removing node {0} and rebalance afterwards".format(server.ip))

                rebalance_done = False
                rebalance_try = 0
                while not rebalance_done:
                    try:
                        ClusterOperationHelper.begin_rebalance_out(master, [server])
                        ClusterOperationHelper.end_rebalance(master)
                        rebalance_done = True
                    except AssertionError as e:
                        rebalance_try += 1
                        self.log.error(e)
                        time.sleep(5)
                        if rebalance_try > 5:
                            raise e

        # stop load
        for loader in loaders:
            loader["mcsoda"].load_stop()

        for loader in loaders:
            loader["thread"].join()
Example #49
0
 def begin_rebalance_out(master, servers, timeout=5):
     RebalanceHelper.begin_rebalance_out(master, servers, timeout)
Example #50
0
    def test_backup_upgrade_restore_default(self):
        if len(self.servers) < 2:
            self.log.error("At least 2 servers required for this test ..")
            return
        original_set = copy.copy(self.servers)
        worker = self.servers[len(self.servers) - 1]
        self.servers = self.servers[:len(self.servers) - 1]
        shell = RemoteMachineShellConnection(self.master)
        o, r = shell.execute_command("cat /opt/couchbase/VERSION.txt")
        fin = o[0]
        shell.disconnect()
        initial_version = self.input.param("initial_version", fin)
        final_version = self.input.param("final_version", fin)
        if initial_version == final_version:
            self.log.error("Same initial and final versions ..")
            return
        if not final_version.startswith('2.0'):
            self.log.error("Upgrade test not set to run from 1.8.1 -> 2.0 ..")
            return
        builds, changes = BuildQuery().get_all_builds(version=final_version)
        product = 'couchbase-server-enterprise'
        #CASE where the worker isn't a 2.0+
        worker_flag = 0
        shell = RemoteMachineShellConnection(worker)
        o, r = shell.execute_command("cat /opt/couchbase/VERSION.txt")
        temp = o[0]
        if not temp.startswith('2.0'):
            worker_flag = 1
        if worker_flag == 1:
            self.log.info("Loading version {0} on worker.. ".format(final_version))
            remote = RemoteMachineShellConnection(worker)
            info = remote.extract_remote_info()
            older_build = BuildQuery().find_build(builds, product, info.deliverable_type,
                                                  info.architecture_type, final_version)
            remote.stop_couchbase()
            remote.couchbase_uninstall()
            remote.download_build(older_build)
            remote.install_server(older_build)
            remote.disconnect()

        remote_tmp = "{1}/{0}".format("backup", "/root")
        perm_comm = "mkdir -p {0}".format(remote_tmp)
        if not initial_version == fin:
            for server in self.servers:
                remote = RemoteMachineShellConnection(server)
                info = remote.extract_remote_info()
                self.log.info("Loading version ..  {0}".format(initial_version))
                older_build = BuildQuery().find_build(builds, product, info.deliverable_type,
                                                      info.architecture_type, initial_version)
                remote.stop_couchbase()
                remote.couchbase_uninstall()
                remote.download_build(older_build)
                remote.install_server(older_build)
                rest = RestConnection(server)
                RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
                rest.init_cluster(server.rest_username, server.rest_password)
                rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
                remote.disconnect()

        self.common_setUp()
        bucket = "default"
        if len(self.servers) > 1:
            self.add_nodes_and_rebalance()
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        size = int(info.memoryQuota * 2.0 / 3.0)
        rest.create_bucket(bucket, ramQuotaMB=size)
        ready = BucketOperationHelper.wait_for_memcached(self.master, bucket)
        self.assertTrue(ready, "wait_for_memcached_failed")
        distribution = {10: 0.2, 20: 0.5, 30: 0.25, 40: 0.05}
        inserted_keys, rejected_keys = MemcachedClientHelper.load_bucket_and_return_the_keys(servers=[self.master],
                                                                                             name=bucket,
                                                                                             ram_load_ratio=0.5,
                                                                                             value_size_distribution=distribution,
                                                                                             moxi=True,
                                                                                             write_only=True,
                                                                                             delete_ratio=0.1,
                                                                                             number_of_threads=2)
        if len(self.servers) > 1:
            rest = RestConnection(self.master)
            self.assertTrue(RebalanceHelper.wait_for_replication(rest.get_nodes(), timeout=180),
                            msg="replication did not complete")

        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_queue_size', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        ready = RebalanceHelper.wait_for_stats_on_all(self.master, bucket, 'ep_flusher_todo', 0)
        self.assertTrue(ready, "wait_for ep_queue_size == 0 failed")
        node = RestConnection(self.master).get_nodes_self()
        shell = RemoteMachineShellConnection(worker)
        o, r = shell.execute_command(perm_comm)
        shell.log_command_output(o, r)
        shell.disconnect()

        #Backup
        #BackupHelper(self.master, self).backup(bucket, node, remote_tmp)
        shell = RemoteMachineShellConnection(worker)
        shell.execute_command("/opt/couchbase/bin/cbbackup http://{0}:{1} {2}".format(
                                                            self.master.ip, self.master.port, remote_tmp))
        shell.disconnect()
        BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self)
        time.sleep(30)

        #Upgrade
        for server in self.servers:
            self.log.info("Upgrading to current version {0}".format(final_version))
            remote = RemoteMachineShellConnection(server)
            info = remote.extract_remote_info()
            new_build = BuildQuery().find_build(builds, product, info.deliverable_type,
                                                info.architecture_type, final_version)
            remote.stop_couchbase()
            remote.couchbase_uninstall()
            remote.download_build(new_build)
            remote.install_server(new_build)
            rest = RestConnection(server)
            RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
            rest.init_cluster(server.rest_username, server.rest_password)
            rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
            remote.disconnect()
        time.sleep(30)

        #Restore
        rest = RestConnection(self.master)
        info = rest.get_nodes_self()
        size = int(info.memoryQuota * 2.0 / 3.0)
        rest.create_bucket(bucket, ramQuotaMB=size)
        ready = BucketOperationHelper.wait_for_memcached(server, bucket)
        self.assertTrue(ready, "wait_for_memcached_failed")
        #BackupHelper(self.master, self).restore(backup_location=remote_tmp, moxi_port=info.moxi)
        shell = RemoteMachineShellConnection(worker)
        shell.execute_command("/opt/couchbase/bin/cbrestore {2} http://{0}:{1} -b {3}".format(
                                                            self.master.ip, self.master.port, remote_tmp, bucket))
        shell.disconnect()
        time.sleep(60)
        keys_exist = BucketOperationHelper.keys_exist_or_assert_in_parallel(inserted_keys, self.master, bucket, self, concurrency=4)
        self.assertTrue(keys_exist, msg="unable to verify keys after restore")
        time.sleep(30)
        BucketOperationHelper.delete_bucket_or_assert(self.master, bucket, self)
        rest = RestConnection(self.master)
        helper = RestHelper(rest)
        nodes = rest.node_statuses()
        master_id = rest.get_nodes_self().id
        if len(self.servers) > 1:
            removed = helper.remove_nodes(
                knownNodes=[node.id for node in nodes],
                ejectedNodes=[node.id for node in nodes if node.id != master_id],
                wait_for_rebalance=True)

        shell = RemoteMachineShellConnection(worker)
        shell.remove_directory(remote_tmp)
        shell.disconnect()

        self.servers = copy.copy(original_set)
        if initial_version == fin:
            builds, changes = BuildQuery().get_all_builds(version=initial_version)
            for server in self.servers:
                remote = RemoteMachineShellConnection(server)
                info = remote.extract_remote_info()
                self.log.info("Loading version ..  {0}".format(initial_version))
                older_build = BuildQuery().find_build(builds, product, info.deliverable_type,
                                                      info.architecture_type, initial_version)
                remote.stop_couchbase()
                remote.couchbase_uninstall()
                remote.download_build(older_build)
                remote.install_server(older_build)
                rest = RestConnection(server)
                RestHelper(rest).is_ns_server_running(testconstants.NS_SERVER_TIMEOUT)
                rest.init_cluster(server.rest_username, server.rest_password)
                rest.init_cluster_memoryQuota(memoryQuota=rest.get_nodes_self().mcdMemoryReserved)
                remote.disconnect()
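The backup/upgrade/restore test shells out to cbbackup and cbrestore with the exact command shapes shown above. A small helper that reproduces those same strings (no flags are assumed beyond what the test itself uses):

def cbbackup_cmd(master_ip, master_port, backup_dir):
    return "/opt/couchbase/bin/cbbackup http://{0}:{1} {2}".format(
        master_ip, master_port, backup_dir)

def cbrestore_cmd(master_ip, master_port, backup_dir, bucket):
    return "/opt/couchbase/bin/cbrestore {2} http://{0}:{1} -b {3}".format(
        master_ip, master_port, backup_dir, bucket)

print(cbbackup_cmd("10.0.0.1", 8091, "/root/backup"))
print(cbrestore_cmd("10.0.0.1", 8091, "/root/backup", "default"))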
Example #51
0
    def test_rebalance_in_out_with_failover(self):
        """
        Rebalances nodes out and in with failover
        Use different nodes_in and nodes_out params to have uneven add and deletion. Use 'zone'
        param to have nodes divided into server groups by having zone > 1.

        This test begins by loading a given number of items into the cluster. It then
        removes one node, rebalances that node out of the cluster, and then rebalances it
        back in. During the rebalancing we update all of the items in the cluster. Once the
        node has been removed and added back, we wait for the disk queues to drain and
        then verify that there has been no data loss: sum(curr_items) must match
        curr_items_total. We then remove and add back two nodes at a time, and so on, until
        we have reached the point where we are adding back and removing at least half of
        the nodes.
        """
        fail_over = self.input.param("fail_over", False)
        gen = BlobGenerator('mike',
                            'mike-',
                            self.value_size,
                            end=self.num_items)
        self._load_all_buckets(self.master, gen, "create", 0)
        tasks = self._async_load_all_buckets(self.master, gen, "update", 0)
        servs_in = self.servers[self.nodes_init:self.nodes_init +
                                self.nodes_in]
        servs_out = self.servers[self.nodes_init -
                                 self.nodes_out:self.nodes_init]
        for task in tasks:
            task.result(self.wait_timeout * 20)
        # Validate seq_no snap_start/stop values after initial doc_load
        self.check_snap_start_corruption()

        self._verify_stats_all_buckets(self.servers[:self.nodes_init],
                                       timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
        self.sleep(20)
        prev_vbucket_stats = self.get_vbucket_seqnos(
            self.servers[:self.nodes_init], self.buckets)
        prev_failover_stats = self.get_failovers_logs(
            self.servers[:self.nodes_init], self.buckets)
        disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
            self.servers[:self.nodes_init], self.buckets, path=None)
        self.compare_vbucketseq_failoverlogs(prev_vbucket_stats,
                                             prev_failover_stats)
        self.rest = RestConnection(self.master)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        result_nodes = list(
            set(self.servers[:self.nodes_init] + servs_in) - set(servs_out))
        for node in servs_in:
            self.rest.add_node(self.master.rest_username,
                               self.master.rest_password, node.ip, node.port)

        # Load data after add-node
        self._load_all_buckets(self.master, gen, "update", 0)
        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()

        # Mark Node for failover
        self.rest.fail_over(chosen[0].id, graceful=fail_over)

        # Load data after failover
        self._load_all_buckets(self.master, gen, "update", 0)
        # Validate seq_no snap_start/stop values
        self.check_snap_start_corruption()

        # No need to pass self.sleep_before_rebalance,
        # since prev ops are synchronous call
        self.shuffle_nodes_between_zones_and_rebalance(servs_out)
        # Validate seq_no snap_start/stop values after rebalance
        self.check_snap_start_corruption()

        self.verify_cluster_stats(result_nodes, check_ep_items_remaining=True)
        self.compare_failovers_logs(prev_failover_stats, result_nodes,
                                    self.buckets)
        self.sleep(30)
        self.data_analysis_active_replica_all(disk_active_dataset,
                                              disk_replica_dataset,
                                              result_nodes,
                                              self.buckets,
                                              path=None)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes,
                                      std=1.0,
                                      total_vbuckets=self.total_vbuckets)
Example #52
0
    def _common_test_body(self):
        master = self.servers[0]
        rest = RestConnection(master)
        bucket_data = RebalanceBaseTest.bucket_data_init(rest)

        # add all servers
        self.log.info("Initially rebalancing in the nodes")
        RebalanceTaskHelper.add_rebalance_task(self.task_manager,
            [master],
            self.servers[1:],
            [], monitor=True, do_stop=self.do_stop)

        self.log.info("Initial loading of data")
        RebalanceBaseTest.load_all_buckets_task(rest, self.task_manager,
            bucket_data, self.load_ratio,
            keys_count=self.keys_count)

        nodes = rest.node_statuses()

        for node in nodes[1:]:
            # Get the current cluster size; we will continue failing over until current_cluster_size = replica + 1
            current_cluster_len = len(rest.node_statuses())
            if current_cluster_len < (self.replica + 1):
                self.log.info(
                    "Replica count {0} is greater than or equal to the current cluster size {1}, stopping failover test.".format(
                        self.replica, current_cluster_len))

            else:
                # Never pick master node
                if node.ip != master.ip:
                    self.log.info("Starting Parallel Load ..")
                    RebalanceBaseTest.tasks_for_buckets(rest, self.task_manager, bucket_data,
                        DELETE_RATIO=self.delete_ratio,
                        ACCESS_RATIO=self.access_ratio, EXPIRY_RATIO=self.expiry_ratio)

                    # Pick a Node to failover
                    toBeEjectedNode = RebalanceHelper.pick_node(master)
                    self.log.info("Starting Failover and Rebalance Out  for node {0}:{1}".format(toBeEjectedNode.ip,
                        toBeEjectedNode.port))

                    # failover the chosen node
                    RebalanceTaskHelper.add_failover_task(self.task_manager,
                        [master],
                        [toBeEjectedNode], True)

                    self.log.info(
                        "Completed Failover for node {0}:{1}".format(toBeEjectedNode.ip, toBeEjectedNode.port))
                    # rebalance Out
                    RebalanceTaskHelper.add_rebalance_task(self.task_manager,
                        [master],
                        [],
                        [toBeEjectedNode], do_stop=self.do_stop, monitor=True)

                    # wait for all tasks to finish
                    RebalanceBaseTest.finish_all_bucket_tasks(rest, bucket_data)
                    self.log.info("Completed Load, Failover and Rebalance Out. ")

                    # verification step
                    if self.do_verify:
                        self.log.info("Verifying with KV-store")
                        RebalanceBaseTest.do_kv_and_replica_verification(master, self.task_manager,
                            bucket_data, self.replica, self, failed_over=True)
                    else:
                        self.log.info("No verification with KV-store specified")
                        # at least 2 nodes required per loop to rebalance out and verify replication
            self.log.info("Completed Load and Rebalance-Out")
Example #53
0
    def common_test_body(self, keys_count, replica, failover_reason):
        log = logger.Logger.get_logger()
        log.info("keys_count : {0}".format(keys_count))
        log.info("replicas : {0}".format(replica))
        log.info("failover_reason : {0}".format(failover_reason))
        log.info('picking server : {0} as the master'.format(self.master))

        self._load_all_buckets(self.master,
                               self.gen_create,
                               "create",
                               0,
                               batch_size=10000,
                               pause_secs=5,
                               timeout_secs=180)
        self._wait_for_stats_all_buckets(self._servers)

        _servers_ = self._servers
        rest = RestConnection(self.master)
        nodes = rest.node_statuses()

        self._wait_for_replication(self._servers, timeout=600)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=replica)
        for node in chosen:
            #let's do op
            if failover_reason == 'stop_server':
                self.stop_server(node)
                log.info(
                    "waiting for membase-server to shut down")
                # wait up to 5 minutes until the node is reported as down
                self.assertTrue(
                    RestHelper(rest).wait_for_node_status(
                        node, "unhealthy", 300),
                    msg=
                    "node status is not unhealthy even after waiting for 5 minutes"
                )
            elif failover_reason == "firewall":
                RemoteUtilHelper.enable_firewall(
                    self._servers, node, bidirectional=self.bidirectional)
                status = RestHelper(rest).wait_for_node_status(
                    node, "unhealthy", 300)
                if status:
                    log.info("node {0}:{1} is 'unhealthy' as expected".format(
                        node.ip, node.port))
                else:
                    #verify iptables on the node if something wrong
                    for server in self._servers:
                        if server.ip == node.ip:
                            shell = RemoteMachineShellConnection(server)
                            o, r = shell.execute_command(
                                "/sbin/iptables --list")
                            shell.log_command_output(o, r)
                            shell.disconnect()
                    self.assertTrue(
                        status,
                        msg=
                        "node status is not unhealthy even after waiting for 5 minutes"
                    )

            failed_over = rest.fail_over(node.id)
            if not failed_over:
                self.log.info(
                    "unable to failover the node on the first attempt. Trying again in 75 seconds..."
                )
                # retry after 75 seconds
                time.sleep(75)
                failed_over = rest.fail_over(node.id)
            self.assertTrue(
                failed_over,
                "unable to failover node after {0}".format(failover_reason))
            log.info("failed over node : {0}".format(node.id))
            self._failed_nodes.append(node)

        if self.add_back_flag:
            for node in self._failed_nodes:
                rest.add_back_node(node.id)
                time.sleep(5)
            log.info(
                "10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[])
            msg = "rebalance failed while removing failover nodes {0}".format(
                chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
        else:
            log.info(
                "10 seconds sleep after failover before invoking rebalance...")
            time.sleep(10)
            rest.rebalance(otpNodes=[node.id for node in nodes],
                           ejectedNodes=[node.id for node in chosen])
            msg = "rebalance failed while removing failover nodes {0}".format(
                chosen)
            self.assertTrue(rest.monitorRebalance(stop_if_loop=True), msg=msg)
            for failed in chosen:
                for server in _servers_:
                    if server.ip == failed.ip:
                        _servers_.remove(server)
                        self._cleanup_nodes.append(server)

        log.info("Begin VERIFICATION ...")
        self._wait_for_stats_all_buckets(_servers_)
        self._wait_for_replication(self._servers, timeout=600)
        self._verify_stats_all_buckets(_servers_)
        self._verify_all_buckets(self.master)
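
As the example shows, a failover request can be rejected right after a node becomes unhealthy, so the test retries once after a delay. A minimal sketch of that retry, assuming rest.fail_over behaves as it does above; the function name is illustrative.

    import time

    def fail_over_with_retry(rest, node_id, retry_delay=75):
        # First attempt; ns_server may still refuse the failover shortly after
        # the node goes unhealthy, so retry a single time after a delay.
        if rest.fail_over(node_id):
            return True
        time.sleep(retry_delay)
        return rest.fail_over(node_id)
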
Example #54
0
    def rebalance_in_with_failover(self):
        fail_over = self.input.param("fail_over", False)
        gen_update = BlobGenerator('mike',
                                   'mike-',
                                   self.value_size,
                                   end=self.num_items)
        tasks = []
        tasks += self._async_load_all_buckets(self.master, gen_update,
                                              "update", 0)
        for task in tasks:
            task.result()
        servs_in = [
            self.servers[i + self.nodes_init] for i in range(self.nodes_in)
        ]
        self._verify_stats_all_buckets(self.servers[:self.nodes_init],
                                       timeout=120)
        self._wait_for_stats_all_buckets(self.servers[:self.nodes_init])
        self.sleep(20)
        prev_failover_stats = self.get_failovers_logs(
            self.servers[:self.nodes_init], self.buckets)
        prev_vbucket_stats = self.get_vbucket_seqnos(
            self.servers[:self.nodes_init], self.buckets)
        disk_replica_dataset, disk_active_dataset = self.get_and_compare_active_replica_data_set_all(
            self.servers[:self.nodes_init], self.buckets, path=None)
        self.rest = RestConnection(self.master)
        self.nodes = self.get_nodes(self.master)
        chosen = RebalanceHelper.pick_nodes(self.master, howmany=1)
        self.rest = RestConnection(self.master)
        self.rest.add_node(self.master.rest_username,
                           self.master.rest_password,
                           self.servers[self.nodes_init].ip,
                           self.servers[self.nodes_init].port)
        # Mark Node for failover
        self.rest.fail_over(chosen[0].id, graceful=fail_over)
        if fail_over:
            self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                            msg="Graceful Failover Failed")
        self.nodes = self.rest.node_statuses()
        self.rest.rebalance(otpNodes=[node.id for node in self.nodes],
                            ejectedNodes=[chosen[0].id])
        self.assertTrue(self.rest.monitorRebalance(stop_if_loop=True),
                        msg="Rebalance Failed")

        # Verification
        new_server_list = self.add_remove_servers(
            self.servers, self.servers[:self.nodes_init], [chosen[0]],
            [self.servers[self.nodes_init]])
        self._verify_stats_all_buckets(new_server_list, timeout=120)
        self.verify_cluster_stats(new_server_list,
                                  check_ep_items_remaining=True)
        self.compare_failovers_logs(prev_failover_stats, new_server_list,
                                    self.buckets)
        self.sleep(30)
        self.data_analysis_active_replica_all(disk_active_dataset,
                                              disk_replica_dataset,
                                              new_server_list,
                                              self.buckets,
                                              path=None)
        self.verify_unacked_bytes_all_buckets()
        nodes = self.get_nodes_in_cluster(self.master)
        self.vb_distribution_analysis(servers=nodes,
                                      buckets=self.buckets,
                                      std=1.0,
                                      total_vbuckets=self.total_vbuckets)
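
Unlike a hard failover, a graceful failover runs as a background task, which is why the test above monitors it before rebalancing the node out. A minimal sketch of that step, assuming the same RestConnection API; the helper name is illustrative.

    def graceful_failover(rest, node_id):
        # Start a graceful failover and wait for the background task to finish;
        # as in the test above, monitorRebalance() tracks its progress.
        rest.fail_over(node_id, graceful=True)
        return rest.monitorRebalance(stop_if_loop=True)
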
Example #55
0
    def test_getr(self):
        item_count = self.input.param("item_count", 10000)
        replica_count = self.input.param("replica_count", 1)
        expiration = self.input.param("expiration", 0)
        delay = float(self.input.param("delay", 0))
        eject = self.input.param("eject", 0)
        delete = self.input.param("delete", 0)
        mutate = self.input.param("mutate", 0)
        warmup = self.input.param("warmup", 0)
        skipload = self.input.param("skipload", 0)
        rebalance = self.input.param("rebalance", 0)

        negative_test = False
        if delay > expiration:
            negative_test = True
        if delete and not mutate:
            negative_test = True
        if skipload and not mutate:
            negative_test = True

        prefix = str(uuid.uuid4())[:7]

        BucketOperationHelper.delete_all_buckets_or_assert([self.master], self)
        BucketOperationHelper.create_bucket(self.master, name=self.default_bucket_name, replica=replica_count, port=11210, test_case=self, bucket_ram=-1, password="")

        if rebalance == GetrTests.DURING_REBALANCE or rebalance == GetrTests.AFTER_REBALANCE:
            # leave 1 node unclustered for rebalance in
            ClusterOperationHelper.begin_rebalance_out(self.master, self.servers[-1:])
            ClusterOperationHelper.end_rebalance(self.master)
            ClusterOperationHelper.begin_rebalance_in(self.master, self.servers[:-1])
            ClusterOperationHelper.end_rebalance(self.master)
        else:
            ClusterOperationHelper.begin_rebalance_in(self.master, self.servers)
            ClusterOperationHelper.end_rebalance(self.master)

        vprefix = ""
        if not skipload:
            self._load_items(item_count=item_count, expiration=expiration, prefix=prefix, vprefix=vprefix)
            if not expiration:
                RebalanceHelper.wait_for_stats_int_value(self.master, self.default_bucket_name, "curr_items_tot", item_count * (replica_count + 1), "<=", 600, True)

        if delete:
            self._delete_items(item_count=item_count, prefix=prefix)

        if mutate:
            vprefix = "mutated"
            self._load_items(item_count=item_count, expiration=expiration, prefix=prefix, vprefix=vprefix)

        self.assertTrue(RebalanceHelper.wait_for_replication(self.rest.get_nodes(), timeout=180),
                            msg="replication did not complete")

        if eject:
            self._eject_items(item_count=item_count, prefix=prefix)

        if delay:
            self.sleep(delay)

        if rebalance == GetrTests.DURING_REBALANCE:
            ClusterOperationHelper.begin_rebalance_in(self.master, self.servers)
        if rebalance == GetrTests.AFTER_REBALANCE:
            ClusterOperationHelper.end_rebalance(self.master)
        if warmup:
            self.log.info("restarting memcached")
            command = "rpc:multicall(erlang, apply, [fun () -> try ns_server_testrunner_api:restart_memcached(20000) catch _:_ -> ns_port_sup:restart_port_by_name(memcached) end end, []], 20000)."
            memcached_restarted, content = self.rest.diag_eval(command)
            #wait until memcached starts
            self.assertTrue(memcached_restarted, "unable to restart memcached process through diag/eval")
            RebalanceHelper.wait_for_stats(self.master, self.default_bucket_name, "curr_items_tot", item_count * (replica_count + 1), 600)

        count = self._getr_items(item_count=item_count, replica_count=replica_count, prefix=prefix, vprefix=vprefix)

        if negative_test:
            self.assertTrue(count == 0, "found {0} items, expected none".format(count))
        else:
            self.assertTrue(count == replica_count * item_count, "expected {0} items, got {1} items".format(replica_count * item_count, count))
        if rebalance == GetrTests.DURING_REBALANCE:
            ClusterOperationHelper.end_rebalance(self.master)
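
The negative-test flags at the top of this example encode when the replica read is expected to return nothing. The same conditions expressed as a small predicate (a sketch; the function name is illustrative):

    def is_negative_getr_case(delay, expiration, delete, mutate, skipload):
        # A replica read should return nothing when the items expired before
        # the read (delay > expiration), or when they were deleted or never
        # loaded and not re-created by a later mutation pass.
        return (delay > expiration) or \
               (delete and not mutate) or \
               (skipload and not mutate)
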
Example #56
0
    def _common_test_body_swap_rebalance(self, do_stop_start=False):
        master = self.cluster.master
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[1:num_initial_servers]

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status = self.task.rebalance(self.cluster.servers[:self.nodes_init],
                                     initial_servers, [])
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("CREATE BUCKET PHASE")
        self.create_buckets()

        self.log.info("DATA LOAD PHASE")
        self.loaders = self.start_load_phase()

        # Wait till load phase is over
        self.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]

        if self.swap_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        if self.do_access:
            self.log.info("DATA ACCESS PHASE")
            self.loaders = self.start_access_phase()

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)

        if do_stop_start:
            # Rebalance is stopped at 20%, 40% and 60% completion
            retry = 0
            for expected_progress in (20, 40, 60):
                self.log.info(
                    "STOP/START SWAP REBALANCE PHASE WITH PROGRESS {0}%".
                    format(expected_progress))
                while True:
                    progress = rest._rebalance_progress()
                    if progress < 0:
                        self.log.error(
                            "rebalance progress code : {0}".format(progress))
                        break
                    elif progress == 100:
                        self.log.warn("Rebalance has already reached 100%")
                        break
                    elif progress >= expected_progress:
                        self.log.info(
                            "Rebalance will be stopped with {0}%".format(
                                progress))
                        stopped = rest.stop_rebalance()
                        self.assertTrue(stopped,
                                        msg="unable to stop rebalance")
                        self.sleep(20)
                        rest.rebalance(otpNodes=[
                            node.id for node in rest.node_statuses()
                        ],
                                       ejectedNodes=optNodesIds)
                        break
                    elif retry > 100:
                        break
                    else:
                        retry += 1
                        self.sleep(1)
        self.assertTrue(
            rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(
                optNodesIds))
        self.verification_phase()
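
The stop/start loop above polls the rebalance progress, stops the rebalance once a threshold is reached, and restarts it with the same ejected nodes. A condensed sketch of one such cycle, assuming the _rebalance_progress()/stop_rebalance() helpers behave as in the example; the function name is illustrative.

    import time

    def stop_and_restart_rebalance_at(rest, expected_progress, ejected_ids,
                                      max_polls=100):
        # Returns True if the rebalance was stopped and restarted at (or past)
        # the requested percentage, False if it errored, finished early, or the
        # poll budget ran out.
        for _ in range(max_polls):
            progress = rest._rebalance_progress()
            if progress < 0 or progress == 100:
                return False
            if progress >= expected_progress:
                if not rest.stop_rebalance():
                    return False
                time.sleep(20)
                rest.rebalance(otpNodes=[n.id for n in rest.node_statuses()],
                               ejectedNodes=ejected_ids)
                return True
            time.sleep(1)
        return False
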
Example #57
0
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        SwapRebalanceBase.create_buckets(self)

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        RebalanceHelper.rebalance_in(initial_servers, len(initial_servers) - 1)

        self.log.info("DATA LOAD PHASE")
        self.loaders = SwapRebalanceBase.start_load_phase(self, master)

        # Wait till load phase is over
        SwapRebalanceBase.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master, howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = ClusterOperationHelper.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info("removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password, server.ip)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = SwapRebalanceBase.start_access_phase(self, master)

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
            ejectedNodes=optNodesIds)

        # Rebalance is failed at 20%, 40% and 60% completion
        for i in [1, 2, 3]:
            expected_progress = 20 * i
            self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(expected_progress))
            RestHelper(rest).rebalance_reached(expected_progress)
            bucket = rest.get_buckets()[0].name
            pid = None
            if self.swap_orchestrator:
                # get PID via remote connection if master is a new node
                shell = RemoteMachineShellConnection(master)
                o, _ = shell.execute_command("ps -eo comm,pid | awk '$1 == \"memcached\" { print $2 }'")
                pid = o[0]
                shell.disconnect()
            else:
                for i in xrange(2):
                    try:
                        _mc = MemcachedClientHelper.direct_client(master, bucket)
                        pid = _mc.stats()["pid"]
                        break
                    except EOFError as e:
                        self.log.error("{0}. Retry in 2 sec".format(e))
                        time.sleep(2)
            if pid is None:
                self.fail("impossible to get a PID")
            command = "os:cmd(\"kill -9 {0} \")".format(pid)
            self.log.info(command)
            killed = rest.diag_eval(command)
            self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port, killed))
            self.log.info("sleep for 10 sec after kill memcached")
            time.sleep(10)
            # we can't get stats for the new node when the rebalance fails
            if not self.swap_orchestrator:
                ClusterOperationHelper._wait_warmup_completed(self, [master], bucket, wait_time=600)
            i = 0
            # we expect the rebalance to fail
            while rest._rebalance_progress_status() == "running" and i < 60:
                self.log.info("rebalance progress: {0}".format(rest._rebalance_progress()))
                time.sleep(1)
                i += 1
            self.log.info("rebalance progress status:{0}".format(rest._rebalance_progress_status()))
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([(node.ip, node.port) for node in knownNodes]))
            ejectedNodes = list(set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes],
                ejectedNodes=ejectedNodes)

        self.assertTrue(rest.monitorRebalance(),
            msg="rebalance operation failed after adding node {0}".format(toBeEjectedNodes))

        SwapRebalanceBase.verification_phase(self, master)
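
The failure is injected by asking ns_server to kill memcached through its diag/eval endpoint, using the same command the example builds. As a tiny sketch (the helper name is illustrative):

    def kill_memcached_via_diag_eval(rest, pid):
        # diag/eval runs arbitrary Erlang on the node; os:cmd/1 shells out and
        # kills the memcached process, which makes the in-flight rebalance fail.
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        return rest.diag_eval(command)
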
Example #58
0
    def _common_test_body_failed_swap_rebalance(self):
        master = self.servers[0]
        rest = RestConnection(master)
        num_initial_servers = self.num_initial_servers
        creds = self.input.membase_settings
        initial_servers = self.servers[:num_initial_servers]

        self.log.info("CREATE BUCKET PHASE")
        self.create_buckets()

        # Cluster all starting set of servers
        self.log.info("INITIAL REBALANCE PHASE")
        status, _ = RebalanceHelper.rebalance_in(initial_servers,
                                                 len(initial_servers) - 1)
        self.assertTrue(status, msg="Rebalance failed")

        self.log.info("DATA LOAD PHASE")
        self.loaders = self.start_load_phase()

        # Wait till load phase is over
        self.stop_load(self.loaders, do_stop=False)
        self.log.info("DONE LOAD PHASE")

        # Start the swap rebalance
        current_nodes = RebalanceHelper.getOtpNodeIds(master)
        self.log.info("current nodes : {0}".format(current_nodes))
        toBeEjectedNodes = RebalanceHelper.pick_nodes(master,
                                                      howmany=self.num_swap)
        optNodesIds = [node.id for node in toBeEjectedNodes]
        if self.swap_orchestrator:
            status, content = self.cluster_util.find_orchestrator(master)
            self.assertTrue(status,
                            msg="Unable to find orchestrator: {0}:{1}".format(
                                status, content))
            # When swapping all the nodes
            if self.num_swap == len(current_nodes):
                optNodesIds.append(content)
            else:
                optNodesIds[0] = content

        for node in optNodesIds:
            self.log.info(
                "removing node {0} and rebalance afterwards".format(node))

        new_swap_servers = self.servers[
            num_initial_servers:num_initial_servers + self.num_swap]
        for server in new_swap_servers:
            otpNode = rest.add_node(creds.rest_username, creds.rest_password,
                                    server.ip, server.port)
            msg = "unable to add node {0} to the cluster"
            self.assertTrue(otpNode, msg.format(server.ip))

        if self.swap_orchestrator:
            rest = RestConnection(new_swap_servers[0])
            master = new_swap_servers[0]

        self.log.info("DATA ACCESS PHASE")
        self.loaders = self.start_access_phase()

        self.log.info("SWAP REBALANCE PHASE")
        rest.rebalance(otpNodes=[node.id for node in rest.node_statuses()],
                       ejectedNodes=optNodesIds)
        self.sleep(10, "Rebalance should start")
        self.log.info("FAIL SWAP REBALANCE PHASE @ {0}".format(
            self.percentage_progress))
        reached = RestHelper(rest).rebalance_reached(self.percentage_progress)
        if reached and RestHelper(rest).is_cluster_rebalanced():
            # handle situation when rebalance failed at the beginning
            self.log.error('seems rebalance failed!')
            rest.print_UI_logs()
            self.fail("rebalance failed even before killing memcached")
        bucket = self.bucket_util.buckets[0]
        pid = None
        if self.swap_orchestrator and not self.cluster_run:
            # get PID via remote connection if master is a new node
            shell = RemoteMachineShellConnection(master)
            pid = shell.get_memcache_pid()
            shell.disconnect()
        else:
            times = 2
            if self.cluster_run:
                times = 20
            for _ in xrange(times):
                try:
                    shell = RemoteMachineShellConnection(server)
                    pid = shell.get_memcache_pid()
                    shell.disconnect()
                    break
                except EOFError as e:
                    self.log.error("{0}.Retry in 2 sec".format(e))
                    self.sleep(2)
        if pid is None:
            self.fail("impossible to get a PID")
        command = "os:cmd(\"kill -9 {0} \")".format(pid)
        self.log.info(command)
        killed = rest.diag_eval(command)
        self.log.info("killed {0}:{1}??  {2} ".format(master.ip, master.port,
                                                      killed))
        self.log.info("sleep for 10 sec after kill memcached")
        self.sleep(10)
        # we can't get stats for the new node when the rebalance fails
        if not self.swap_orchestrator:
            self.bucket_util._wait_warmup_completed([master],
                                                    bucket,
                                                    wait_time=600)
        # we expect the rebalance to fail
        try:
            rest.monitorRebalance()
        except RebalanceFailedException:
            # retry rebalance if it failed
            self.log.warn("Rebalance failed but it's expected")
            self.sleep(30)
            self.assertFalse(RestHelper(rest).is_cluster_rebalanced(),
                             msg="cluster reports as balanced, but a failed rebalance was expected")
            knownNodes = rest.node_statuses()
            self.log.info("nodes are still in cluster: {0}".format([
                (node.ip, node.port) for node in knownNodes
            ]))
            ejectedNodes = list(
                set(optNodesIds) & set([node.id for node in knownNodes]))
            rest.rebalance(otpNodes=[node.id for node in knownNodes],
                           ejectedNodes=ejectedNodes)
            self.assertTrue(
                rest.monitorRebalance(),
                msg="Rebalance failed after adding node {0}".format(
                    toBeEjectedNodes))
        else:
            self.log.info("rebalance completed successfully")
        self.verification_phase()
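
When the rebalance fails as expected, the example re-issues it using only the ejected nodes that the cluster still knows about. A minimal sketch of that recovery step, assuming the same RestConnection API; the function name is illustrative.

    def retry_rebalance_after_failure(rest, ejected_ids):
        # Only eject nodes that ns_server still reports, then rebalance again
        # and wait for it to complete.
        known = rest.node_statuses()
        still_known = list(set(ejected_ids) & set(n.id for n in known))
        rest.rebalance(otpNodes=[n.id for n in known],
                       ejectedNodes=still_known)
        return rest.monitorRebalance()
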
Example #59
0
    def wait_for_persistence(self, timeout=120):
        RebalanceHelper.wait_for_persistence(self.master, self.bucket, timeout)
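
Waiting for persistence boils down to polling the disk-write-queue stats until they drain, the same ep_queue_size / ep_flusher_todo stats the warm-up test below waits on. A minimal sketch against a single memcached client (assumed to behave like the clients returned by MemcachedClientHelper.direct_client elsewhere in these examples):

    import time

    def wait_for_flusher_empty(mc, timeout=120):
        # Persistence is done once the disk write queue has drained, i.e.
        # ep_queue_size and ep_flusher_todo both report 0.
        start = time.time()
        while time.time() - start < timeout:
            stats = mc.stats()
            if int(stats["ep_queue_size"]) == 0 and int(stats["ep_flusher_todo"]) == 0:
                return True
            time.sleep(2)
        return False
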
Example #60
0
    def do_warmup(self):
        howmany = self.num_of_docs
        self.input = TestInputSingleton.input
        self.servers = self.input.servers
        self._insert_data(howmany)

        RebalanceHelper.wait_for_stats_on_all(self.master, "default",
                                              "ep_queue_size", 0)
        RebalanceHelper.wait_for_stats_on_all(self.master, "default",
                                              "ep_flusher_todo", 0)
        time.sleep(5)
        rest = RestConnection(self.master)

        map = {}
        #collect curr_items from all nodes
        for server in self.servers:
            mc_conn = MemcachedClientHelper.direct_client(server, "default")
            map["{0}:{1}".format(server.ip, server.port)] = {}
            map["{0}:{1}".format(
                server.ip, server.port)]["curr_items_tot"] = mc_conn.stats(
                    "")["curr_items_tot"]
            map["{0}:{1}".format(
                server.ip,
                server.port)]["previous_uptime"] = mc_conn.stats("")["uptime"]

            self.log.info("memcached {0}:{1} has {2} items".format(
                server.ip, server.port,
                mc_conn.stats("")["curr_items_tot"]))
            mc_conn.close()

        # Killing Memcached
        nodes = rest.node_statuses()

        for node in nodes:
            _node = {
                "ip": node.ip,
                "port": node.port,
                "username": self.servers[0].rest_username,
                "password": self.servers[0].rest_password
            }
            _mc = MemcachedClientHelper.direct_client(_node, "default")
            pid = _mc.stats()["pid"]
            node_rest = RestConnection(_node)
            command = "os:cmd(\"kill -9 {0} \")".format(pid)
            self.log.info(command)
            killed = node_rest.diag_eval(command)
            self.log.info("killed ??  {0} ".format(killed))
            _mc.close()

        start = time.time()

        memcached_restarted = False
        for server in self.servers:
            mc = None
            while time.time() - start < 60:
                try:
                    mc = MemcachedClientHelper.direct_client(server, "default")
                    stats = mc.stats()

                    new_uptime = int(stats["uptime"])
                    if new_uptime < map["{0}:{1}".format(
                            server.ip, server.port)]["previous_uptime"]:
                        self.log.info("memcached restarted...")
                        memcached_restarted = True
                        break
                except Exception:
                    self.log.error("unable to connect to {0}:{1}".format(
                        server.ip, server.port))
                    if mc:
                        mc.close()
                    time.sleep(1)
            if not memcached_restarted:
                self.fail("memcached did not start {0}:{1}".format(
                    server.ip, server.port))

        for server in self.servers:
            mc = MemcachedClientHelper.direct_client(server, "default")
            expected_curr_items_tot = map["{0}:{1}".format(
                server.ip, server.port)]["curr_items_tot"]
            now_items = 0
            start = time.time()

            if server == self.servers[0]:
                wait_time = 600
            else:
                wait_time = 60
                # Try to get the stats for up to wait_time seconds, else give up.
            while time.time() - start < wait_time:
                # Get the warmup time for each server
                try:
                    stats = mc.stats()
                    if stats is not None:
                        warmup_time = int(stats["ep_warmup_time"])
                        self.log.info("ep_warmup_time is %s " % warmup_time)
                        self.log.info(
                            "Collected the stats {0} for server {1}:{2}".
                            format(stats["ep_warmup_time"], server.ip,
                                   server.port))
                        break
                    else:
                        self.log.info(
                            " Did not get the stats from the server yet, trying again....."
                        )
                        time.sleep(2)
                except Exception as e:
                    self.log.error(
                        "Could not get warmup_time stats from server {0}:{1}, exception {2}"
                        .format(server.ip, server.port, e))

            else:
                self.fail(
                    "Fail! Unable to get the warmup-stats from server {0}:{1} after trying for {2} seconds."
                    .format(server.ip, server.port, wait_time))

            # Verify the item count on each server; if the same count (below the expected
            # count) repeats for over 3 minutes, fail. Try for up to 30 minutes in total.
            start = time.time()
            while time.time() - start < 1800:
                time.sleep(2)
                if mc.stats()["curr_items_tot"] < expected_curr_items_tot:
                    self.log.info(
                        "still warming up .... curr_items_tot : {0}".format(
                            mc.stats()["curr_items_tot"]))
                    while now_items == mc.stats()["curr_items_tot"]:
                        if time.time() - start <= 180:
                            self.log.info(
                                "still warming up .... curr_items_tot : {0}".
                                format(mc.stats()["curr_items_tot"]))
                        else:
                            self.fail(
                                "Getting repetitive data, exiting from this server"
                            )
                else:
                    self.log.info(
                        "warmup completed, curr_items_tot : {0}".format(
                            mc.stats()["curr_items_tot"]))
                    break
                now_items = mc.stats()["curr_items_tot"]
            mc.close()
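
The restart check above works by recording memcached's "uptime" stat before the kill and waiting for it to drop. A minimal sketch of that detection for a single server, assuming a client factory that behaves like MemcachedClientHelper.direct_client; the function name is illustrative.

    import time

    def memcached_restarted(server, previous_uptime, client_factory, timeout=60):
        # A restart shows up as the "uptime" stat dropping below the value
        # recorded before memcached was killed.
        start = time.time()
        while time.time() - start < timeout:
            try:
                mc = client_factory(server, "default")
                uptime = int(mc.stats()["uptime"])
                mc.close()
                if uptime < previous_uptime:
                    return True
            except Exception:
                pass
            time.sleep(1)
        return False
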