Example 1
    def stop(self, **kwargs):
        """
        Stop background probe

        NB! Do not reorder fetching fut.result() and killing the JMX node: doing so may lead to py4j errors in the log.
        """
        super().stop()

        # write 1 into lock file to stop all probes
        with open('lock', 'w') as f:
            f.write('1')

        # get results from the probe threads (this guarantees that jmx is no longer in use)
        sleep(10)

        for name, fut in self.futures.items():
            if fut.exception():
                log_print("Failed to evaluate probe %s, %s" %
                          (name, fut.exception()),
                          color='red')
            else:
                self.results[self.version][name] = fut.result()

        # kill utility using ignite api (jmx gateway already down in probe thread)
        if self.jmx:
            self.jmx.kill_utility()
            # self.ignite.kill_node(self.jmx.node_id)

        # shutdown executor
        self.executor.shutdown(wait=True)

        # remove lock file
        os.remove('lock')
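
This stop() assumes a start() counterpart that created the lock file, the executor and the futures dict it consumes. A minimal, hypothetical sketch of such a counterpart is below; ProbeSketch is a placeholder name and only the lock-file / executor / futures contract mirrors what stop() expects (jmx_thread_method is the probe thread shown in Example 8).

# Hypothetical start() counterpart; not the framework's API, just the contract stop() relies on.
from concurrent.futures import ThreadPoolExecutor


class ProbeSketch:
    def start(self, nodes_to_monitor, ignite_nodes, jmx_node_id, metrics_to_collect):
        # create an empty lock file; stop() later overwrites it with '1',
        # which the probe threads poll to know when to exit
        open('lock', 'w').close()

        self.executor = ThreadPoolExecutor(max_workers=1)
        self.futures = {
            'jmx': self.executor.submit(jmx_thread_method,
                                        nodes_to_monitor,
                                        ignite_nodes,
                                        jmx_node_id,
                                        metrics_to_collect),
        }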
Example 2
    def check_single_artifact(self):
        if len(self.results.keys()) < 2:
            log_print('There are 0 or 1 results for the time probe')
            self.result_passed = True
            return True
        else:
            return False
    def stop(self, **kwargs):
        """
        Store the 'speed' value passed by the test at stop time
        """
        if 'speed' not in kwargs:
            log_print(
                'Unable to get time specific results from test: no such property "speed"',
                color='red')
            return

        super().stop()

        self.results[self.version] += kwargs.get('speed')
    def is_passed(self, **kwargs):
        """
        Formula:
        (base_version.total_time - observable_version.total_time) / base_version.total_time

        I.e. the difference between the base and observable versions, as a percentage

        :return is verification passed
        """
        if self.check_single_artifact():
            return True

        is_passed = True

        for version in self.results:
            if version == self.base_version:
                continue

            base_version_result = avg(self.results[self.base_version])
            comparable_version_result = avg(self.results[version])

            time_diff = percentage_diff(base_version_result,
                                        comparable_version_result)
            min_value = float(self.probe_config.get('diff')[0])
            max_value = float(self.probe_config.get('diff')[1])

            is_passed = min_value < time_diff < max_value

            self.result_message = 'Time run difference: {:.2f}, {} (base) - {} sec, {} - {} sec'.format(
                time_diff,
                self.base_version,
                int(base_version_result / 1000) if not self.seconds else
                base_version_result,  # milliseconds to seconds
                version,
                int(comparable_version_result /
                    1000) if not self.seconds else comparable_version_result
                # milliseconds to seconds
            )

            log_print(self.result_message,
                      color=self.get_out_color(is_passed, **kwargs))

        self.result_passed = is_passed

        return is_passed
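
A worked instance of the check above, assuming percentage_diff(base, observable) computes (base - observable) / base as in the docstring; the real avg()/percentage_diff() helpers belong to the framework and may be implemented differently.

# Stand-in helpers and sample data, for illustration only.
def avg(values):
    values = list(values)
    return sum(values) / len(values) if values else 0


def percentage_diff(base, observable):
    return (base - observable) / base if base else 0


base_times = [120_000, 118_000]        # base version results, ms
observable_times = [90_000, 92_000]    # observable version results, ms

time_diff = percentage_diff(avg(base_times), avg(observable_times))
print('%.2f' % time_diff)              # 0.24 -> passes with diff bounds such as (-0.1, 0.3)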
Example 5
    def is_passed(self, **kwargs):
        """
        Formula:
        (base_version.end_size - observable_version.end_size) / base_version.end_size

        I.e. the difference between the base and observable versions, as a percentage

        :return is verification passed
        """
        if self.check_single_artifact():
            return True

        is_passed = False

        for version in self.results.keys():
            if version == self.base_version:
                continue

            base_result = avg(self.results[self.base_version])
            comparable_result = avg(self.results[version])

            avg_snapshot_size_diff = percentage_diff(base_result,
                                                     comparable_result)

            min_value = float(self.probe_config.get('diff')[0])
            max_value = float(self.probe_config.get('diff')[1])

            is_passed = min_value < avg_snapshot_size_diff < max_value

            self.result_message = \
                'Average snapshot directory size difference: {:.2f}. {} (base) - {}, {} - {}'.format(
                    avg_snapshot_size_diff,
                    self.base_version,
                    convert_size(base_result),
                    version,
                    convert_size(comparable_result)
                )

            log_print(self.result_message,
                      color=self.get_out_color(is_passed, **kwargs))

        self.result_passed = is_passed

        return is_passed
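
convert_size() above presumably renders a byte count as a human-readable string. A rough stand-in for reading these messages locally (deliberately named differently, since the framework helper may format values another way):

def convert_size_sketch(size_bytes):
    # walk up the units until the value fits
    size = float(size_bytes)
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024 or unit == 'TB':
            return '%.1f %s' % (size, unit)
        size /= 1024


print(convert_size_sketch(5 * 1024 * 1024))   # 5.0 MB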
Example 6
    def is_passed(self, **kwargs):
        """
        Formula:
        (base_version.end_size - observable_version.end_size) / base_version.end_size

        I.e. the difference between the base and observable versions, as a percentage

        :return is verification passed
        """
        if self.check_single_artifact():
            return True

        is_passed = True

        for version in self.results.keys():
            if version == self.base_version:
                continue

            db_total_size = percentage_diff(
                self.results[self.base_version]['end'],
                self.results[version]['end'])

            min_value = float(self.probe_config.get('diff')[0])
            max_value = float(self.probe_config.get('diff')[1])

            is_passed = min_value < db_total_size < max_value

            self.result_message = \
                '/db folder size difference: {:.2f}. {} (base) - {}, {} - {}' \
                    .format(db_total_size,
                            self.base_version,
                            convert_size(self.results[self.base_version]['end']),
                            version,
                            convert_size(self.results[version]['end']))

            log_print(self.result_message,
                      color=self.get_out_color(is_passed, **kwargs))

        self.result_passed = is_passed

        return is_passed
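
All of these checks read the 'diff' bounds the same way: two values, the minimum and maximum allowed relative difference. A plausible probe_config shape implied by probe_config.get('diff')[0] / [1] (the actual keys come from the test configuration and may differ):

# Hypothetical probe_config fragment; only the two-element 'diff' bound is implied by the code.
probe_config = {
    'diff': [-0.1, 0.3],   # min/max allowed relative difference
}

min_value = float(probe_config.get('diff')[0])
max_value = float(probe_config.get('diff')[1])
print(min_value < 0.2 < max_value)   # True: a 20% difference would pass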
    def is_passed(self, **kwargs):
        """
        Formula:
        (observable_version.total_time - base_version.total_time) / base_version.total_time

        I.e. the difference between the base and observable versions, as a percentage

        :return is verification passed
        """
        if self.check_single_artifact():
            return True

        is_passed = True
        for version in self.results:
            if version == self.base_version:
                continue

            base_version_result = avg(self.results[self.base_version])
            comparable_version_result = avg(self.results[version])

            # inverted difference - in this case bigger is better
            speed_diff = percentage_diff(base_version_result,
                                         comparable_version_result)
            min_value = float(self.probe_config.get('diff')[0])
            max_value = float(self.probe_config.get('diff')[1])

            is_passed = min_value < speed_diff < max_value

            self.result_message = 'Rebalance speed difference: {:.2f}, {} (base) - {}/sec, {} - {}/sec'.format(
                speed_diff, self.base_version,
                convert_size(base_version_result, start_byte=True), version,
                convert_size(comparable_version_result, start_byte=True))

            log_print(self.result_message,
                      color=self.get_out_color(is_passed, **kwargs))

        self.result_passed = is_passed

        return is_passed
Example 8
    def jmx_thread_method(nodes_to_monitor,
                          ignite_nodes,
                          jmx_node_id,
                          metrics_to_collect,
                          timeout=5):
        """
        Probe thread that collects JMX metrics from the specified nodes

        Uses a mocked JmxUtility (does not start a new instance, just uses its existing methods).
        Nothing needs to be passed to create this instance; we only override nodes, gateway and service.

        :param nodes_to_monitor: nodes that we want to monitor (server nodes in this example)
        :param ignite_nodes: nodes from current Ignite app (need to get PID)
        :param jmx_node_id: jmx node id from tiden.ignite.nodes
        :param metrics_to_collect: {'attr': {'grp': 'Group', 'bean': 'Bean', 'attribute': 'Attr', 'type': 'int'}, ...}
        :param timeout: interval between metric collections
        :return: collected results ('default' python type)
        """
        # collected metrics: {time: {node_id: {metric_name: value}}}
        jmx_metric = {}

        jmx = None
        try:
            jmx = JmxUtility()
            jmx.initialize_manually(jmx_node_id, ignite_nodes)

            with open('lock', 'r') as f:
                while True:
                    if f.read(1) == '1':
                        log_print("Background probe JMX has been interrupted")
                        break

                    current_time = get_current_time()
                    for node_idx in nodes_to_monitor:
                        if current_time not in jmx_metric:
                            jmx_metric[current_time] = {}

                        if node_idx not in jmx_metric[current_time]:
                            jmx_metric[current_time][node_idx] = {}

                        for name, metric in metrics_to_collect.items():
                            try:
                                string_value = \
                                    jmx.get_attributes(node_idx,
                                                       metric['grp'],
                                                       metric['bean'],
                                                       metric['attribute'],
                                                       )[metric['attribute']]

                                if metric['type'] == 'int':
                                    jmx_metric[current_time][node_idx][
                                        name] = int(string_value)
                                else:
                                    jmx_metric[current_time][node_idx][
                                        name] = string_value
                            except Exception:
                                jmx_metric[current_time][node_idx][name] = None

                        sleep(timeout)
        except Exception:
            log_print(traceback.format_exc())
        finally:
            # Close connections and shutdown gateway properly
            if jmx:
                jmx.kill_manually()

        return jmx_metric
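
A hedged usage sketch for the probe thread above: the metrics dict has to match the keys the loop reads, including a 'type' key that the int cast relies on. The group, bean and attribute names below are placeholders, as are the node variables.

from concurrent.futures import ThreadPoolExecutor

# Illustrative metric description; real JMX group/bean/attribute names depend on the Ignite build.
metrics_to_collect = {
    'checkpointLockHoldTime': {
        'grp': 'SomeGroup',            # placeholder
        'bean': 'SomeBean',            # placeholder
        'attribute': 'SomeAttribute',  # placeholder
        'type': 'int',                 # values of this metric get int()-converted
    },
}

nodes_to_monitor = [1, 2]              # placeholder node ids
ignite_nodes = {}                      # placeholder: nodes dict of the Ignite app
jmx_node_id = 100500                   # placeholder JMX node id

open('lock', 'w').close()              # probe threads poll this file; stop() writes '1' into it

executor = ThreadPoolExecutor(max_workers=1)
future = executor.submit(jmx_thread_method,
                         nodes_to_monitor, ignite_nodes, jmx_node_id,
                         metrics_to_collect, timeout=5)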
Example 9
    def heap_thread_method(nodes_to_monitor,
                           ignite_nodes,
                           ssh_config,
                           timeout=5):
        """
        Probe thread that collects JVM heap usage from nodes_to_monitor

        Command to collect: "jcmd PID GC.class_histogram"
        This command prints the following text:

        "PID:
        1. JAVA_OBJECT_NUM JAVA_OBJECT_SIZE JAVA_OBJECT_NAME
        ...
        N.
        Total TOTAL_OBJECTS TOTAL_OBJECTS_SIZE"

        So we need to extract the PID (to match the output to a node) and TOTAL_OBJECTS_SIZE from it.

        :param nodes_to_monitor: nodes that we want to monitor (server nodes in this example)
        :param ignite_nodes: nodes from current Ignite app (need to get PID)
        :param ssh_config: config['ssh_config'] from tiden config (Need to initialize SshPool)
        :return: collected results ('default' python type)
        """
        ssh = SshPool(ssh_config)
        ssh.connect()

        heap_result = {}

        try:
            with open('lock', 'r') as f:
                while True:
                    if f.read(1) == '1':
                        log_print("Background probe HEAP has been interrupted")
                        break

                    commands = {}
                    node_ids_to_pid = {}

                    for node_ids in nodes_to_monitor:
                        node_ids_to_pid[node_ids] = ignite_nodes[node_ids][
                            'PID']

                    for node_idx in nodes_to_monitor:
                        host = ignite_nodes[node_idx]['host']
                        if commands.get(host) is None:
                            commands[host] = [
                                'jcmd %s GC.class_histogram' %
                                ignite_nodes[node_idx]['PID']
                            ]
                        else:
                            commands[host].append(
                                'jcmd %s GC.class_histogram' %
                                ignite_nodes[node_idx]['PID'])

                    results = ssh.exec(commands)

                    results_parsed = {}
                    for host in results.keys():
                        result = results[host][0]

                        findall = re.compile(
                            r'(\d+):\n|Total\s+\d+\s+(\d+)').findall(result)

                        # findall returns a list of tuples: [('PID', ''), ('', 'TOTAL_HEAP_USAGE')]
                        # todo maybe there is a better way to get this
                        if findall:
                            node_id = 0
                            for idx, pid in node_ids_to_pid.items():
                                if pid == int(findall[0][0]):
                                    node_id = idx
                                    break

                            try:
                                results_parsed[node_id] = (int(findall[1][1]))
                            except Exception:
                                results_parsed[node_id] = 0
                        else:
                            continue

                    heap_result[get_current_time()] = results_parsed

                    sleep(timeout)
        except Exception:
            log_print(traceback.format_exc())

        return heap_result
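
The parsing step above, isolated against a trimmed sample of "jcmd PID GC.class_histogram" output (the real histogram has many more class lines); it shows why findall[0][0] carries the PID and findall[1][1] the total byte count.

import re

sample = (
    "12345:\n"
    " num     #instances         #bytes  class name\n"
    "   1:          1000          64000  java.lang.String\n"
    "Total          2000         128000\n"
)

findall = re.compile(r'(\d+):\n|Total\s+\d+\s+(\d+)').findall(sample)
# -> [('12345', ''), ('', '128000')]
pid = int(findall[0][0])            # 12345, matched against node_ids_to_pid
total_bytes = int(findall[1][1])    # 128000, stored per node in heap_result
print(pid, total_bytes)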
Example 10
    def cpu_mem_thread_method(nodes_to_monitor,
                              ignite_nodes,
                              ssh_config,
                              timeout=5):
        """
        Probe thread that collects CPU and memory usage from nodes_to_monitor

        Command to collect: "ps -p PID -o pid,%cpu,%mem"

        :param nodes_to_monitor: nodes that we want to monitor (server nodes in this example)
        :param ignite_nodes: nodes from current Ignite app (need to get PID)
        :param ssh_config: config['ssh_config'] from tiden config (Need to initialize SshPool)
        :param timeout: interval between data collections
        :return: collected results ('default' python type)
        """
        ssh = SshPool(ssh_config)
        ssh.connect()

        cpu_mem_result = {}

        with open('lock', 'r') as f:
            while True:
                if f.read(1) == '1':
                    log_print("Background probe CPU has been interrupted")
                    break

                commands = {}
                node_ids_to_pid = {}

                for node_ids in nodes_to_monitor:
                    node_ids_to_pid[node_ids] = ignite_nodes[node_ids]['PID']

                for node_idx in nodes_to_monitor:
                    host = ignite_nodes[node_idx]['host']
                    if commands.get(host) is None:
                        commands[host] = [
                            'ps -p %s -o pid,%%cpu,%%mem' %
                            ignite_nodes[node_idx]['PID']
                        ]
                    else:
                        commands[host].append('ps -p %s -o pid,%%cpu,%%mem' %
                                              ignite_nodes[node_idx]['PID'])

                results = ssh.exec(commands)

                results_parsed = {}
                for host in results.keys():
                    result = results[host][0]

                    search = re.search(r'(\d+)\s+?(\d+\.?\d?)\s+?(\d+\.?\d?)',
                                       result)

                    if search:
                        node_id = 0
                        for idx, pid in node_ids_to_pid.items():
                            if pid == int(search.group(1)):
                                node_id = idx
                                break

                        results_parsed[node_id] = (float(search.group(2)),
                                                   float(search.group(3)))
                    else:
                        continue

                cpu_mem_result[get_current_time()] = results_parsed

                sleep(timeout)

        return cpu_mem_result
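
The same idea for the CPU/memory probe: the parsing step isolated against typical "ps -p PID -o pid,%cpu,%mem" output; groups 2 and 3 are the CPU and memory percentages stored per node.

import re

sample = (
    "  PID %CPU %MEM\n"
    "12345 42.5  3.1\n"
)

search = re.search(r'(\d+)\s+?(\d+\.?\d?)\s+?(\d+\.?\d?)', sample)
if search:
    pid = int(search.group(1))        # 12345, matched against node_ids_to_pid
    cpu = float(search.group(2))      # 42.5 (%cpu)
    mem = float(search.group(3))      # 3.1  (%mem)
    print(pid, cpu, mem)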
Example 11
 def print_results(self):
     log_print('Results for db size probe: {}'.format(self.results))
Example 12
 def print_results(self):
     log_print('Results for checkpointLockHoldTime probe: {}'.format(
         self.results))
Example 13
    def print_pre_259_start_times(self):
        stopped_in_cp = {}

        for node, logs_content in self.collect_specific_text_from_logs(
                '.*Ignite node stopped in the middle of checkpoint.').items():
            stopped_in_cp[
                node] = 'Ignite node stopped in the middle of checkpoint.' in logs_content
        binary_recovery_time = self.collect_events(
            r'.*performed in (\d+) ms',
            self.collect_specific_text_from_logs(
                'Binary recovery performed in'))
        part_states = self.collect_events(
            r'.*partitionsProcessed=(\d+), time=(\d+)ms]',
            self.collect_specific_text_from_logs(
                'Finished restoring partition state for local groups'))
        wal_changes = self.collect_events(
            r'.*updatesApplied=(\d+), time=(\d+) ms',
            self.collect_specific_text_from_logs(
                'Finished applying WAL changes'))
        memory_changes = self.collect_events(
            r'.*changesApplied=(\d+), time=(\d+) ms',
            self.collect_specific_text_from_logs(
                'Finished applying memory changes'))
        log_print(stopped_in_cp, color='yellow')
        log_print(binary_recovery_time, color='yellow')
        log_print(part_states, color='yellow')
        log_print(wal_changes, color='yellow')
        log_print(memory_changes, color='yellow')

        for node in stopped_in_cp.keys():
            log_print("Report for node %s %s" %
                      (node, '(coordinator)' if node == 1 else ''),
                      color='green')

            if stopped_in_cp[node]:
                log_print("Stopped in the middle of checkpoint", color='green')
            else:
                log_print("Stopped gracefully", color='green')

            log_print("Binary was restored in %s ms" %
                      binary_recovery_time[node][0],
                      color='green')

            part_stated_time = part_states[node][1]
            part_stated_speed = (
                part_states[node][1] /
                part_states[node][0]) if part_states[node][0] != 0 else 0
            log_print(
                "Partitions recovery time = %s ms, (time / partitions) = %.2f"
                % (part_stated_time, part_stated_speed),
                color='green')

            wal_changes_time = wal_changes[node][1]
            wal_changes_updates = wal_changes[node][0]
            log_print("Applying WAL changes time = %s ms, updates = %s" %
                      (wal_changes_time, wal_changes_updates),
                      color='green')

            if node in memory_changes:
                memory_changes_time = memory_changes[node][1]
                memory_changes_speed = (memory_changes[node][1] / memory_changes[node][0]) \
                    if memory_changes[node][0] != 0 else 0
                log_print(
                    "Applying memory changes time = %s ms, (time / changes) = %.2f"
                    % (memory_changes_time, memory_changes_speed),
                    color='green')
            else:
                log_print("There is no memory changes log message found",
                          color='green')
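
collect_events() and collect_specific_text_from_logs() are framework helpers; the sketch below only shows what the partition-state regex above pulls out of a log line of the expected shape (the sample line is illustrative, not a verbatim Ignite message) and how the per-partition time in the report is derived.

import re

line = ('Finished restoring partition state for local groups '
        '[partitionsProcessed=1024, time=350ms]')

match = re.search(r'partitionsProcessed=(\d+), time=(\d+)ms]', line)
partitions, time_ms = int(match.group(1)), int(match.group(2))
print(partitions, time_ms)                # 1024 350
print('%.2f' % (time_ms / partitions))    # 0.34 - the "(time / partitions)" value in the report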