Esempio n. 1
0
 def reset_all_app_stats(self, name):
     """Reset the stats counters on every running instance of an app.

     @args:
     name: Name of the app (must be registered in self.apps)
     """
     l.info("Attempting to reset client stats for %s...", name)
     assert(name in self.apps)
     for task_id, endpoint in self.apps[name]['ip_port_map'].items():
         port, ip = endpoint[0], endpoint[1]
         analyser = HAnalyser(ip, port, task_id)
         # Ask the remote client to clear its stats counters
         analyser.reset_stats()
         # Closing the analyser socket; the object is unusable afterwards
         analyser.stop()
Esempio n. 2
0
 def fetch_app_stats(self, name, group_name=""):
     """
     Fetch stats from every instance of the app (or only the instances
     of one group) and store them under self.apps[name]['stats'],
     keyed by "ip:port".

     Collection relies on every stats dict exposing a "msg_cnt" field:
     a task's stats are considered final once msg_cnt stops increasing
     between two successive reads taken 0.1s apart.

     @args:
     name:         Name of the app
     group_name:   Group name if only the group's stats are required (optional)
     """
     assert(name in self.apps)
     task_list = self.all_task_ids[name]
     if group_name:
         assert(group_name in self.app_group)
         # NOTE(review): other variants of this method use
         # self.app_group[group_name].tasks_list here — confirm whether
         # app_group values are plain task lists in this class.
         task_list = self.app_group[group_name]
         l.info("Attempting to fetch client group stats for app[%s], group[%s]...", name, group_name)
     else:
         l.info("Attempting to fetch client stats for app[%s]...", name)
     self.apps[name]['stats'] = {}
     # first_itr / no_delay_needed_count persist ACROSS tasks: each task is
     # polled until its msg_cnt is stable, and after more than 40 cumulative
     # stable first-reads the remaining tasks are read once without polling.
     first_itr = True
     no_delay_needed_count = 0
     for task_id in task_list:
         info = self.apps[name]['ip_port_map'][task_id]
         port = info[0]
         ip = info[1]
         ha_sub = HAnalyser(ip, port, task_id)
         # First stats read (blocks until the remote client responds)
         stats = ha_sub.get_stats()
         while first_itr:
             time.sleep(.1)
             stats2 = ha_sub.get_stats()
             # make sure this sub has stopped receiving data before
             # accepting its stats as final
             if (stats['msg_cnt'] == stats2['msg_cnt']):
                 no_delay_needed_count += 1
                 if (no_delay_needed_count > 40):
                     # Stop the extra delayed re-reads entirely once 40+
                     # successive reads were stable on msg_cnt
                     first_itr = False
                 break
             no_delay_needed_count = 0
             stats = stats2
         ha_sub.stop()  # closes the Analyser socket, can not be used anymore
         stats['task_id'] = task_id
         self.apps[name]['stats'][str(ip) + ':' + str(port)] = stats  # copy.deepcopy(stats)
Esempio n. 3
0
 def reset_all_app_stats(self, name, group_name=""):
     """
     Reset the stats counters on all instances of an application.
     @args:
     name:         Name of the app
     group_name:   Group name if only the group's instances should be
                   reset (optional)
     """
     assert (name in self.apps)
     if group_name:
         assert (group_name in self.app_group)
         task_list = self.app_group[group_name].tasks_list
         l.info(
             "Attempting to reset client group stats for app[%s], group[%s]...",
             name, group_name)
     else:
         task_list = self.all_task_ids[name]
         l.info("Attempting to reset client stats for app[%s]...", name)
     ip_port_map = self.apps[name]['ip_port_map']
     for task_id in task_list:
         entry = ip_port_map[task_id]
         analyser = HAnalyser(entry[1], entry[0], task_id)
         # Tell the remote client to clear its stats counters
         analyser.reset_stats()
         # The analyser socket is closed here and cannot be reused
         analyser.stop()
Esempio n. 4
0
    def rerun_test(self, options):
        """Re-run the Cassandra stress test and return the parsed results.

        Signals every stress client to start at a shared time (now + 60s),
        optionally runs a node-failure simulation in a parallel thread,
        waits for all clients to finish, then collects their stats.
        """
        self.options = options
        self.reset_all_app_stats(self.stress_client)
        # Signal message sending
        l.info("Sending signal to Cassandra Stress client to start sending all messages..")
        # Give every client the same start time, 60 seconds in the future,
        # so they all begin sending simultaneously.
        start_time = datetime.now() + timedelta(seconds=60)
        l.debug("Current Time: %s, Start Time: %s" % (datetime.now(), start_time))
        ha_list = []
        for task_id in self.all_task_ids[self.stress_client]:
            endpoint = self.apps[self.stress_client]['ip_port_map'][task_id]
            analyser = HAnalyser(endpoint[1], endpoint[0], task_id)
            # Tell this client to start the test at the shared start time
            analyser.start_test(start_time=start_time)
            ha_list.append(analyser)
        l.info('Waiting for test(s) to end...')
        if self.options.sim_failure:
            l.debug("Simulate Cassandra Node Failure. Init.")
            # Event used to tell the failure thread that all tests finished
            tests_completed = threading.Event()
            l.debug("Launch separate thread to simulate node failure and rejoin.")
            failure_thread = threading.Thread(
                target=simulate_node_failure,
                args=(self.options.cluster_ips.split(','),
                      self.options.test_duration, tests_completed))
            failure_thread.start()
        for idx, analyser in enumerate(ha_list):
            l.debug('Waiting for task [%s] in [%s:%s] test to END. Iteration: %s' % (analyser.task_id, analyser.server_ip, analyser.port, idx))
            analyser.wait_for_testend()
        if self.options.sim_failure:
            l.debug("ALL tests are COMPLETED.")
            tests_completed.set()
        l.info('Fetch App Stats')
        self.fetch_app_stats(self.stress_client)

        return self.result_parser()
Esempio n. 5
0
 def reset_all_app_stats(self, name, group_name=""):
     """
     Reset the stats counters on all instances of an application.
     @args:
     name:         Name of the app
     group_name:   Group name if only the group's instances should be
                   reset (optional)
     """
     assert(name in self.apps)
     if group_name:
         assert(group_name in self.app_group)
         task_list = self.app_group[group_name]
         l.info("Attempting to reset client group stats for app[%s], group[%s]...", name, group_name)
     else:
         task_list = self.all_task_ids[name]
         l.info("Attempting to reset client stats for app[%s]...", name)
     ipm = self.apps[name]['ip_port_map']
     for task_id in task_list:
         entry = ipm[task_id]
         analyser = HAnalyser(entry[1], entry[0], task_id)
         # Clear the remote client's stats counters
         analyser.reset_stats()
         # The analyser socket is closed here and cannot be reused
         analyser.stop()
Esempio n. 6
0
 def fetch_app_stats(self, name):
     """Fetch stats from every instance of the app and store them locally
     under self.apps[name]['stats'], keyed by "ip:port".

     Collection relies on every stats dict exposing a "msg_cnt" field:
     a task's stats are considered final once msg_cnt stops increasing
     between two successive reads taken 0.1s apart.
     """
     assert(name in self.apps)
     ipm = self.apps[name]['ip_port_map']
     self.apps[name]['stats'] = {}
     # first_itr / no_delay_needed_count persist ACROSS tasks: each task is
     # polled until its msg_cnt is stable, and after more than 40 cumulative
     # stable first-reads the remaining tasks are read once without polling.
     first_itr = True
     no_delay_needed_count = 0
     for task_id, info in ipm.items():
         port = info[0]
         ip = info[1]
         ha_sub = HAnalyser(ip, port, task_id)
         # First stats read (blocks until the remote client responds)
         stats = ha_sub.get_stats()
         while first_itr:
             time.sleep(.1)
             stats2 = ha_sub.get_stats()
             # make sure this sub has stopped receiving data before
             # accepting its stats as final
             if (stats['msg_cnt'] == stats2['msg_cnt']):
                 no_delay_needed_count += 1
                 if (no_delay_needed_count > 40):
                     # Stop the extra delayed re-reads entirely once 40+
                     # successive reads were stable on msg_cnt
                     first_itr = False
                 break
             no_delay_needed_count = 0
             stats = stats2
         ha_sub.stop()  # closes the Analyser socket, can not be used anymore
         stats['task_id'] = task_id
         self.apps[name]['stats'][str(ip) + ':' + str(port)] = stats  # copy.deepcopy(stats)
Esempio n. 7
0
    def remove_unresponsive_tasks(self, name, group_name=""):
        """
        Ping all the application tasks and remove any that do not respond
        from the active task list, the ip/port map, and every group that
        references them.
        @args:
        name:         Name of the app
        group_name:   Group name if only the group's tasks should be
                      pinged (optional)
        """
        assert(name in self.apps)
        task_list = self.all_task_ids[name]
        if group_name:
            assert(group_name in self.app_group)
            task_list = self.app_group[group_name].tasks_list
            l.debug('Pinging group instances of app[%s], group[%s] to make sure they are started....', name, group_name)
        else:
            l.debug('Pinging instances of app[%s] to make sure they are started....', name)
        cnt = 0
        remove_list = []
        for task_id in task_list:
            info = self.apps[name]['ip_port_map'][task_id]
            port = info[0]
            ip = info[1]
            ha = HAnalyser(ip, port, task_id)
            res = ha.do_ping()
            if not res:
                l.info("Ping failed to [%s] %s:%s. removing from client list" % (task_id, ip, port))
                remove_list.append(task_id)
            cnt += res
            # BUGFIX: stop() used to be called twice for a failed ping (once
            # in the failure branch and once here); close exactly once.
            ha.stop()  # closes the Analyser socket, can not be used anymore
        l.info('Done pinging all the clients. Got pong response from %d out of %d' %
               (cnt, len(self.apps[name]['ip_port_map'])))

        # Collect, per group, the unresponsive tasks that belong to it.
        temp_dict = {}
        for g_name in self.app_group.keys():
            temp_dict[g_name] = []
        for item in remove_list:
            l.info("Removing client [%s]" % (item))
            del self.apps[name]['ip_port_map'][item]
            self.all_task_ids[name].remove(item)
            for g_name, g_obj in self.app_group.items():
                g_list = g_obj.tasks_list
                l.debug("Checking if bad client[%s] is in group[%s]", item, g_name)
                l.debug(g_list)
                if item in g_list:
                    l.info("Appending [%s] in group [%s]", item, g_name)
                    temp_dict[g_name].append(item)
        l.info(temp_dict)
        # Remove the bad clients from their groups in a second pass so the
        # group task lists are not mutated while being scanned above.
        for g_name, bad_list in temp_dict.items():
            for bad_client in bad_list:
                l.info("Removing client [%s] from group [%s]", bad_client, g_name)
                self.app_group[g_name].tasks_list.remove(bad_client)
Esempio n. 8
0
 def fetch_app_stats(self, name, group_name=""):
     """
     Fetch stats from every instance of the app (or only the instances
     of one group) and store them under self.apps[name]['stats'],
     keyed by "ip:port".

     Collection relies on every stats dict exposing a "msg_cnt" field:
     a task's stats are considered final once msg_cnt stops increasing
     between two successive reads taken 0.1s apart.

     @args:
     name:         Name of the app
     group_name:   Group name if only the group's stats are required (optional)
     """
     assert (name in self.apps)
     task_list = self.all_task_ids[name]
     if group_name:
         assert (group_name in self.app_group)
         task_list = self.app_group[group_name].tasks_list
         l.info(
             "Attempting to fetch client group stats for app[%s], group[%s]...",
             name, group_name)
     else:
         l.info("Attempting to fetch client stats for app[%s]...", name)
     self.apps[name]['stats'] = {}
     # first_itr / no_delay_needed_count persist ACROSS tasks: each task is
     # polled until its msg_cnt is stable, and after more than 40 cumulative
     # stable first-reads the remaining tasks are read once without polling.
     first_itr = True
     no_delay_needed_count = 0
     for task_id in task_list:
         info = self.apps[name]['ip_port_map'][task_id]
         port = info[0]
         ip = info[1]
         ha_sub = HAnalyser(ip, port, task_id)
         # First stats read (blocks until the remote client responds)
         stats = ha_sub.get_stats()
         while first_itr:
             time.sleep(.1)
             stats2 = ha_sub.get_stats()
             # make sure this sub has stopped receiving data before
             # accepting its stats as final
             if (stats['msg_cnt'] == stats2['msg_cnt']):
                 no_delay_needed_count += 1
                 if (no_delay_needed_count > 40):
                     # Stop the extra delayed re-reads entirely once 40+
                     # successive reads were stable on msg_cnt
                     first_itr = False
                 break
             no_delay_needed_count = 0
             stats = stats2
         ha_sub.stop(
         )  # closes the Analyser socket, can not be used anymore
         stats['task_id'] = task_id
         self.apps[name]['stats'][str(ip) + ':' +
                                  str(port)] = stats  # copy.deepcopy(stats)
Esempio n. 9
0
    def rerun_test(self, options):
        """Re-run the Cassandra stress test and return the parsed results.

        Signals every stress client to start at a shared time (now + 60s),
        optionally runs a node-failure simulation in a parallel thread,
        waits for all clients to finish, then collects their stats.
        """
        self.options = options
        self.reset_all_app_stats(self.stress_client)
        # Signal message sending
        l.info(
            "Sending signal to Cassandra Stress client to start sending all messages.."
        )
        # Give every client the same start time, 60 seconds in the future,
        # so they all begin sending simultaneously.
        start_time = datetime.now() + timedelta(seconds=60)
        l.debug("Current Time: %s, Start Time: %s" %
                (datetime.now(), start_time))
        ha_list = []
        for task_id in self.all_task_ids[self.stress_client]:
            endpoint = self.apps[self.stress_client]['ip_port_map'][task_id]
            analyser = HAnalyser(endpoint[1], endpoint[0], task_id)
            # Tell this client to start the test at the shared start time
            analyser.start_test(start_time=start_time)
            ha_list.append(analyser)
        l.info('Waiting for test(s) to end...')
        if self.options.sim_failure:
            l.debug("Simulate Cassandra Node Failure. Init.")
            # Event used to tell the failure thread that all tests finished
            tests_completed = threading.Event()
            l.debug(
                "Launch separate thread to simulate node failure and rejoin.")
            failure_thread = threading.Thread(
                target=simulate_node_failure,
                args=(self.options.cluster_ips.split(','),
                      self.options.test_duration, tests_completed))
            failure_thread.start()
        for idx, analyser in enumerate(ha_list):
            l.debug(
                'Waiting for task [%s] in [%s:%s] test to END. Iteration: %s' %
                (analyser.task_id, analyser.server_ip, analyser.port, idx))
            analyser.wait_for_testend()
        if self.options.sim_failure:
            l.debug("ALL tests are COMPLETED.")
            tests_completed.set()
        l.info('Fetch App Stats')
        self.fetch_app_stats(self.stress_client)

        return self.result_parser()
Esempio n. 10
0
 def ping_all_app_inst(self, name):
     """Ping every task of an app and drop the ones that do not answer.

     Unresponsive tasks are removed from the app's ip_port_map so that
     later operations do not block on dead clients.
     @args:
     name: Name of the app
     """
     l.info('Pinging all the instances of %s to make sure they are started....', name)
     cnt = 0
     remove_list = []
     for task_id, info in self.apps[name]['ip_port_map'].items():
         port = info[0]
         ip = info[1]
         ha = HAnalyser(ip, port, task_id)
         res = ha.do_ping()
         if not res:
             l.info("Ping failed to [%s] %s:%s. removing from client list" % (task_id, ip, port))
             remove_list.append(task_id)
         cnt += res
         # BUGFIX: stop() used to be called twice for a failed ping (once
         # in the failure branch and once here); close exactly once.
         ha.stop()  # closes the Analyser socket, can not be used anymore
     l.info('Done pinging all the clients. Got pong response from %d out of %d' %
            (cnt, len(self.apps[name]['ip_port_map'])))
     for item in remove_list:
         del self.apps[name]['ip_port_map'][item]
Esempio n. 11
0
    def remove_unresponsive_tasks(self, name, group_name=""):
        """
        Ping all the application tasks and remove any that do not respond
        from the active task list, the ip/port map, and every group that
        references them.
        @args:
        name:         Name of the app
        group_name:   Group name if only the group's tasks should be
                      pinged (optional)
        """
        assert (name in self.apps)
        task_list = self.all_task_ids[name]
        if group_name:
            assert (group_name in self.app_group)
            task_list = self.app_group[group_name].tasks_list
            l.debug(
                'Pinging group instances of app[%s], group[%s] to make sure they are started....',
                name, group_name)
        else:
            l.debug(
                'Pinging instances of app[%s] to make sure they are started....',
                name)
        cnt = 0
        remove_list = []
        for task_id in task_list:
            info = self.apps[name]['ip_port_map'][task_id]
            port = info[0]
            ip = info[1]
            ha = HAnalyser(ip, port, task_id)
            res = ha.do_ping()
            if not res:
                l.info("Ping failed to [%s] %s:%s. removing from client list" %
                       (task_id, ip, port))
                remove_list.append(task_id)
            cnt += res
            # BUGFIX: stop() used to be called twice for a failed ping (once
            # in the failure branch and once here); close exactly once.
            ha.stop()  # closes the Analyser socket, can not be used anymore
        l.info(
            'Done pinging all the clients. Got pong response from %d out of %d'
            % (cnt, len(self.apps[name]['ip_port_map'])))

        # Collect, per group, the unresponsive tasks that belong to it.
        temp_dict = {}
        for g_name in self.app_group.keys():
            temp_dict[g_name] = []
        for item in remove_list:
            l.info("Removing client [%s]" % (item))
            del self.apps[name]['ip_port_map'][item]
            self.all_task_ids[name].remove(item)
            for g_name, g_obj in self.app_group.items():
                g_list = g_obj.tasks_list
                l.debug("Checking if bad client[%s] is in group[%s]", item,
                        g_name)
                l.debug(g_list)
                if item in g_list:
                    l.info("Appending [%s] in group [%s]", item, g_name)
                    temp_dict[g_name].append(item)
        l.info(temp_dict)
        # Remove the bad clients from their groups in a second pass so the
        # group task lists are not mutated while being scanned above.
        for g_name, bad_list in temp_dict.items():
            for bad_client in bad_list:
                l.info("Removing client [%s] from group [%s]", bad_client,
                       g_name)
                self.app_group[g_name].tasks_list.remove(bad_client)
Esempio n. 12
0
    def rerun_test(self, options):
        """Re-run the pub/sub test with updated options and return results.

        Pushes the new test metrics to the PUB server, re-selects which
        subscribers are slow / reconnecting (only when the requested counts
        changed), runs the test, then fetches publisher and subscriber stats.
        """
        self.set_options(options)
        self.boundary_setup(self.options, 'msg_rate', self.boundary_resultfn)
        l.info("Updating test metrics: test_duration=%s, msg_batch=%s, msg_rate=%s",
               self.options.test_duration, self.options.msg_batch, self.options.msg_rate)

        # Update the PUB server with new metrics
        self.ha_pub.update_config(test_duration=self.options.test_duration,
                                  msg_batch=self.options.msg_batch,
                                  msg_requested_rate=self.options.msg_rate)
        l.info("PUB server updated")

        self.reset_all_app_stats(self.zstsub)
        # Decide which subscribers will be slow / reconnecting and record
        # the selection as app properties on the sub app.
        acnt = self.get_app_instcnt(self.zstsub)
        slow_num = int(acnt * options.slow_clients_percent / 100)
        update_sub_config = False
        if slow_num:
            slow_clients = self.get_app_property(self.zstsub, 'slow_clients')
            if not slow_clients or int(slow_num) != len(slow_clients):
                # Re-select the slow clients when the requested count changed
                self.set_app_property(self.zstsub, 'slow_clients',
                                      Set(self.random_select_instances(self.zstsub, slow_num)))
                update_sub_config = True
        rec_num = int(acnt * options.rec_clients_percent / 100)
        if rec_num:
            rec_clients = self.get_app_property(self.zstsub, 'reconnecting_clients')
            if not rec_clients or rec_num != len(rec_clients):
                self.set_app_property(self.zstsub, 'reconnecting_clients',
                                      Set(self.random_select_instances(self.zstsub, rec_num)))
                update_sub_config = True
        if update_sub_config:
            # Push the new recv/reconnect rates to every subscriber instance
            ipm = self.get_app_ipport_map(self.zstsub)
            slow_set = self.get_app_property(self.zstsub, 'slow_clients')
            rec_set = self.get_app_property(self.zstsub, 'reconnecting_clients')
            for key in ipm.keys():
                ip = ipm[key][1]
                port = ipm[key][0]
                ha = HAnalyser(ip, port, key)
                recv_rate = 0
                reconnect_rate = 0
                if slow_set and key in slow_set:
                    print("Task ID " + key + " Is going to be slow")
                    recv_rate = options.slow_clients_rate
                if rec_set and key in rec_set:
                    print("Task ID " + key + " Is going to be reconnecting")
                    reconnect_rate = options.rec_clients_rate
                ha.update_config(recv_rate=recv_rate, reconnect_rate=reconnect_rate)
                ha.stop()

        # Signal message sending
        l.info("Sending signal to PUB to start sending all messages..")
        self.ha_pub.start_test()
        self.ha_pub.wait_for_testend()
        self.fetch_app_stats(self.zstpub)
        assert(len(self.apps[self.zstpub]['stats']) == 1)
        # BUGFIX: dict.values() is not indexable on Python 3; materialize it
        # first so this works on both Python 2 and 3.
        pub_data = list(self.apps[self.zstpub]['stats'].values())[0]
        l.info("Publisher send %d packets at the rate of %d pps" % (pub_data['msg_cnt'],
                                                                    pub_data['rate']))

        # Fetch all sub client data
        self.fetch_app_stats(self.zstsub)

        return self.result_parser()
Esempio n. 13
0
 def __init__(self, server_ip, server_port):
     """Delegate construction to the HAnalyser base class.

     @args:
     server_ip:   IP address passed through to HAnalyser
     server_port: Port passed through to HAnalyser
     """
     HAnalyser.__init__(self, server_ip, server_port)
Esempio n. 14
0
 def __init__(self, server_ip, server_port, task_id):
     """Delegate construction to the HAnalyser base class.

     @args:
     server_ip:   IP address passed through to HAnalyser
     server_port: Port passed through to HAnalyser
     task_id:     Task identifier passed through to HAnalyser
     """
     HAnalyser.__init__(self, server_ip, server_port, task_id)
Esempio n. 15
0
 def __init__(self, server_ip, server_port):
     """Delegate construction to the HAnalyser base class.

     @args:
     server_ip:   IP address passed through to HAnalyser
     server_port: Port passed through to HAnalyser
     """
     HAnalyser.__init__(self, server_ip, server_port)
Esempio n. 16
0
 def __init__(self, server_ip, server_port, task_id):
     """Delegate construction to the HAnalyser base class.

     @args:
     server_ip:   IP address passed through to HAnalyser
     server_port: Port passed through to HAnalyser
     task_id:     Task identifier passed through to HAnalyser
     """
     HAnalyser.__init__(self, server_ip, server_port, task_id)
Esempio n. 17
0
    def rerun_test(self, options):
        """Re-run the pub/sub test with updated options and return results.

        Pushes the new test metrics to the PUB server, re-selects which
        subscribers are slow / reconnecting (only when the requested counts
        changed), runs the test, then fetches publisher and subscriber stats.
        """
        self.set_options(options)
        self.boundary_setup(self.options, 'msg_rate', self.boundary_resultfn)
        l.info("Updating test metrics: test_duration=%s, msg_batch=%s, msg_rate=%s",
               self.options.test_duration, self.options.msg_batch, self.options.msg_rate)

        # Update the PUB server with new metrics
        self.ha_pub.update_config(test_duration=self.options.test_duration,
                                  msg_batch=self.options.msg_batch,
                                  msg_requested_rate=self.options.msg_rate)
        l.info("PUB server updated")

        self.reset_all_app_stats(self.zstsub)
        # Decide which subscribers will be slow / reconnecting and record
        # the selection as app properties on the sub app.
        acnt = self.get_app_instcnt(self.zstsub)
        slow_num = int(acnt * options.slow_clients_percent / 100)
        update_sub_config = False
        if slow_num:
            slow_clients = self.get_app_property(self.zstsub, 'slow_clients')
            if not slow_clients or int(slow_num) != len(slow_clients):
                # Re-select the slow clients when the requested count changed
                self.set_app_property(self.zstsub, 'slow_clients',
                                      Set(self.random_select_instances(self.zstsub, slow_num)))
                update_sub_config = True
        rec_num = int(acnt * options.rec_clients_percent / 100)
        if rec_num:
            rec_clients = self.get_app_property(self.zstsub, 'reconnecting_clients')
            if not rec_clients or rec_num != len(rec_clients):
                self.set_app_property(self.zstsub, 'reconnecting_clients',
                                      Set(self.random_select_instances(self.zstsub, rec_num)))
                update_sub_config = True
        if update_sub_config:
            # Push the new recv/reconnect rates to every subscriber instance
            ipm = self.get_app_ipport_map(self.zstsub)
            slow_set = self.get_app_property(self.zstsub, 'slow_clients')
            rec_set = self.get_app_property(self.zstsub, 'reconnecting_clients')
            for key in ipm.keys():
                ip = ipm[key][1]
                port = ipm[key][0]
                ha = HAnalyser(ip, port, key)
                recv_rate = 0
                reconnect_rate = 0
                if slow_set and key in slow_set:
                    print("Task ID " + key + " Is going to be slow")
                    recv_rate = options.slow_clients_rate
                if rec_set and key in rec_set:
                    print("Task ID " + key + " Is going to be reconnecting")
                    reconnect_rate = options.rec_clients_rate
                ha.update_config(recv_rate=recv_rate, reconnect_rate=reconnect_rate)
                ha.stop()

        # Signal message sending
        l.info("Sending signal to PUB to start sending all messages..")
        self.ha_pub.start_test()
        self.ha_pub.wait_for_testend()
        self.fetch_app_stats(self.zstpub)
        assert(len(self.apps[self.zstpub]['stats']) == 1)
        # BUGFIX: dict.values() is not indexable on Python 3; materialize it
        # first so this works on both Python 2 and 3.
        pub_data = list(self.apps[self.zstpub]['stats'].values())[0]
        l.info("Publisher send %d packets at the rate of %d pps" % (pub_data['msg_cnt'],
                                                                    pub_data['rate']))

        # Fetch all sub client data
        self.fetch_app_stats(self.zstsub)

        return self.result_parser()