def query(self, query, start, end, scrap_metrics_step=None):
     """
     :param start: time=<rfc3339 | unix_timestamp>: Start timestamp.
     :param end: time=<rfc3339 | unix_timestamp>: End timestamp.
     :param scrap_metrics_step is the granularity of data requested from Prometheus DB
     :param query:
     :return: {
               metric: { },
               values: [[linux_timestamp1, value1], [linux_timestamp2, value2]...[linux_timestampN, valueN]]
              }
     """
     url = "http://{}:{}/api/v1/query_range?query=".format(
         normalize_ipv6_url(self.host), self.port)
     if not scrap_metrics_step:
         scrap_metrics_step = self.scylla_scrape_interval
     _query = "{url}{query}&start={start}&end={end}&step={scrap_metrics_step}".format(
         url=url,
         query=query,
         start=start,
         end=end,
         scrap_metrics_step=scrap_metrics_step)
     LOGGER.debug("Query to PrometheusDB: %s", _query)
     result = self.request(url=_query)
     if result:
         return result["data"]["result"]
     else:
         LOGGER.error("Prometheus query unsuccessful!")
         return []
 def __init__(self, host, port=9090, alternator=None):
     self.host = host
     self.port = port
     self.range_query_url = "http://{}:{}/api/v1/query_range?query=".format(
         normalize_ipv6_url(host), port)
     self.config = self.get_configuration()
     self.alternator = alternator
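
A minimal usage sketch of the range query above, assuming the surrounding class is instantiated as prom and that a scylla_reactor_utilization metric is being scraped (instance name, host, and metric are placeholders):

import time

end = int(time.time())
start = end - 3600  # look back one hour
results = prom.query(query="scylla_reactor_utilization", start=start, end=end,
                     scrap_metrics_step=60)
for series in results:
    labels = series["metric"]           # e.g. {"instance": "10.0.0.1:9180", "shard": "0"}
    for timestamp, value in series["values"]:
        print(labels.get("instance"), timestamp, value)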
Example #3
 def get_configuration(self):
     result = self.request(url="http://{}:{}/api/v1/status/config".format(normalize_ipv6_url(self.host), self.port))
     configs = yaml.safe_load(result["data"]["yaml"])
     LOGGER.debug("Parsed Prometheus configs: %s", configs)
     new_scrape_configs = {}
     for conf in configs["scrape_configs"]:
         new_scrape_configs[conf["job_name"]] = conf
     configs["scrape_configs"] = new_scrape_configs
     return configs
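
Since get_configuration() re-keys scrape_configs by job_name, a specific job's settings can be looked up directly. A short sketch, assuming an instance called prometheus and a scrape job named "scylla" (both are placeholders):

configs = prometheus.get_configuration()
scylla_job = configs["scrape_configs"].get("scylla", {})
print(scylla_job.get("scrape_interval"), scylla_job.get("metrics_path"))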
Example #4
 def web_driver_docker_client(self) -> Optional[DockerClient]:
     if not self.ssh_login_info:
         return None
     SSHAgent.add_keys((self.ssh_login_info["key_file"], ))
     # Due to a bug in the docker package (https://github.com/docker-library/python/issues/517),
     # the port has to be passed down explicitly to support IPv6.
     user = self.ssh_login_info['user']
     hostname = normalize_ipv6_url(self.ssh_login_info['hostname'])
     try:
         return DockerClient(base_url=f"ssh://{user}@{hostname}:22", timeout=DOCKER_API_CALL_TIMEOUT)
     except paramiko.ssh_exception.BadHostKeyException as exc:
         system_host_keys_path = os.path.expanduser("~/.ssh/known_hosts")
         system_host_keys = paramiko.hostkeys.HostKeys(system_host_keys_path)
         if system_host_keys.pop(exc.hostname, None):
             system_host_keys.save(system_host_keys_path)
         return DockerClient(base_url=f"ssh://{user}@{hostname}:22", timeout=DOCKER_API_CALL_TIMEOUT)
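
A minimal usage sketch of the SSH-backed Docker client above; `node` stands for an object exposing the method, and the calls used (ping, containers.list) are part of the docker SDK:

client = node.web_driver_docker_client()
if client is not None:
    client.ping()  # confirm the ssh:// transport is reachable
    for container in client.containers.list():
        print(container.name, container.status)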
Example #5
    def check_timeout(self):  # pylint: disable=too-many-locals
        assert self.monitors.nodes, 'Monitor node should be set; metrics are fetched from the Prometheus server'
        base_url = "http://%s:9090/api/v1/query_range" % normalize_ipv6_url(
            self.monitors.nodes[0].external_address)
        range_str = "&start={0.start}&end={0.end}".format(self)
        cmd = [
            "curl",
            "{}?query=scylla_storage_proxy_coordinator_read_timeouts{}&step=60s"
            .format(base_url, range_str)
        ]
        self.log.debug(
            'Get read timeout per minute by Prometheus API, cmd: %s', cmd)
        result = subprocess.check_output(cmd)

        orig_data = json.loads(result)
        read_timeout_msg = 'Read timeout of whole datacenter per minute should be less than 5000'
        self.log.debug('Check if we have significant read timeout, %s',
                       read_timeout_msg)

        # parse prometheus response to generate a result matrix
        matrix = []
        for i in orig_data['data']['result']:
            shard_unit = []
            for j in i['values']:
                shard_unit.append(int(j[1]))
            matrix.append(shard_unit)

        # go through the matrix to check timeout per minute
        prev = None
        significant = []
        for time_idx in range(len(matrix[0])):
            all_timeout = 0
            for shard_unit in matrix:
                all_timeout += shard_unit[time_idx]
            if prev:
                timeout_per_min = all_timeout - prev
                self.log.debug('timeout_per_min: %s', timeout_per_min)
                if timeout_per_min > 5000:
                    significant.append(timeout_per_min)
            prev = all_timeout

        self.log.debug(significant)
        assert not significant, read_timeout_msg
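
The metric queried above is a cumulative counter, so the loop subtracts consecutive totals to get timeouts per step. A self-contained sketch of the same computation on made-up values:

matrix = [
    [100, 160, 220],   # shard 0: cumulative timeouts at t0, t1, t2
    [200, 240, 300],   # shard 1
]
totals = [sum(column) for column in zip(*matrix)]                    # [300, 400, 520]
per_minute = [cur - prev for prev, cur in zip(totals, totals[1:])]   # [100, 120]
significant = [delta for delta in per_minute if delta > 5000]        # []
assert not significant, 'Read timeout of whole datacenter per minute should be less than 5000'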
Example #6
    def test_custom_time(self):
        """
        Run cassandra-stress with params defined in data_dir/scylla.yaml
        """
        # pylint: disable=too-many-locals,too-many-branches,too-many-statements

        self.db_cluster.add_nemesis(nemesis=self.get_nemesis_class(),
                                    tester_obj=self)
        stress_queue = list()
        write_queue = list()
        verify_queue = list()

        # prepare write workload
        prepare_write_cmd = self.params.get('prepare_write_cmd', default=None)
        keyspace_num = self.params.get('keyspace_num', default=1)

        pre_create_schema = self.params.get('pre_create_schema', default=False)

        alternator_port = self.params.get('alternator_port', default=None)
        if alternator_port:
            endpoint_url = 'http://{}:{}'.format(normalize_ipv6_url(self.db_cluster.nodes[0].external_address),
                                                 alternator_port)
            alternator_create_table(endpoint_url, test_params=self.params)

        if prepare_write_cmd:
            # In some cases (like many keyspaces), we want to create the schema (all keyspaces & tables) before the load
            # starts - due to the heavy load, schema propagation can take a long time and c-s fails.
            if pre_create_schema:
                self._pre_create_schema(keyspace_num, scylla_encryption_options=self.params.get(
                    'scylla_encryption_options', None))

            # When the load is too heavy for one loader when using MULTI-KEYSPACES, the load is spread evenly across
            # the loaders (round_robin).
            if keyspace_num > 1 and self.params.get('round_robin'):
                self.log.debug("Using round_robin for multiple Keyspaces...")
                for i in range(1, keyspace_num + 1):
                    keyspace_name = self._get_keyspace_name(i)
                    self._run_all_stress_cmds(write_queue, params={'stress_cmd': prepare_write_cmd,
                                                                   'keyspace_name': keyspace_name,
                                                                   'round_robin': True})
            # Not using round_robin and all keyspaces will run on all loaders
            else:
                self._run_all_stress_cmds(write_queue, params={'stress_cmd': prepare_write_cmd,
                                                               'keyspace_num': keyspace_num,
                                                               'round_robin': self.params.get('round_robin')})

            # In some cases we don't want the nemesis to run during the "prepare" stage, in order to be 100% sure that
            # all keys were written successfully
            if self.params.get('nemesis_during_prepare'):
                # Wait for some data (according to the param in the yaml) to be populated; for multi keyspace we need
                # to pay attention to the fact it checks only keyspace1
                self.db_cluster.wait_total_space_used_per_node(keyspace=None)
                self.db_cluster.start_nemesis()

            # Wait on the queue till all threads come back.
            # TODO: we need to improve this part for cases where threads are killed and we don't catch it.
            for stress in write_queue:
                self.verify_stress_thread(cs_thread_pool=stress)

            # Run nodetool flush on all nodes to make sure nothing left in memory
            # I decided to comment this out for now, when we found the data corruption bug, we wanted to be on the safe
            # side, but I don't think we should continue with this approach.
            # If we decided to add this back in the future, we need to wrap it with try-except because it can run
            # in parallel to nemesis and it will fail on one of the nodes.
            # self._flush_all_nodes()

            # In case we would like to verify all keys were written successfully before we start other stress / nemesis
            prepare_verify_cmd = self.params.get('prepare_verify_cmd', default=None)
            if prepare_verify_cmd:
                self._run_all_stress_cmds(verify_queue, params={'stress_cmd': prepare_verify_cmd,
                                                                'keyspace_num': keyspace_num})

                for stress in verify_queue:
                    self.verify_stress_thread(cs_thread_pool=stress)

        # Collect data about partitions and their rows amount
        validate_partitions = self.params.get('validate_partitions', default=None)
        table_name, primary_key_column, partitions_dict_before = '', '', {}
        if validate_partitions:
            table_name = self.params.get('table_name', default=None)
            primary_key_column = self.params.get('primary_key_column', default=None)
            self.log.debug('Save partitions info before reads')
            partitions_dict_before = self.collect_partitions_info(table_name=table_name,
                                                                  primary_key_column=primary_key_column,
                                                                  save_into_file_name='partitions_rows_before.log')

        stress_cmd = self.params.get('stress_cmd', default=None)
        if stress_cmd:
            # Stress: Same as in prepare_write - allow the load to be spread across all loaders when using multi ks
            if keyspace_num > 1 and self.params.get('round_robin'):
                self.log.debug("Using round_robin for multiple Keyspaces...")
                for i in range(1, keyspace_num + 1):
                    keyspace_name = self._get_keyspace_name(i)
                    params = {'keyspace_name': keyspace_name, 'round_robin': True, 'stress_cmd': stress_cmd}

                    self._run_all_stress_cmds(stress_queue, params)

            # The old method when we run all stress_cmds for all keyspaces on the same loader
            else:
                params = {'keyspace_num': keyspace_num, 'stress_cmd': stress_cmd}
                self._run_all_stress_cmds(stress_queue, params)

        customer_profiles = self.params.get('cs_user_profiles', default=[])
        if customer_profiles:
            cs_duration = self.params.get('cs_duration', default='50m')
            for cs_profile in customer_profiles:
                assert os.path.exists(cs_profile), 'File not found: {}'.format(cs_profile)
                self.log.debug('Run stress test with user profile {}, duration {}'.format(cs_profile, cs_duration))
                profile_dst = os.path.join('/tmp', os.path.basename(cs_profile))
                with open(cs_profile) as pconf:
                    cont = pconf.readlines()
                    user_profile_table_count = self.params.get(  # pylint: disable=invalid-name
                        'user_profile_table_count', default=1)
                    for i in range(user_profile_table_count):
                        for cmd in [line.lstrip('#').strip() for line in cont if line.find('cassandra-stress') > 0]:
                            stress_cmd = (cmd.format(profile_dst, cs_duration))
                            params = {'stress_cmd': stress_cmd, 'profile': cs_profile}
                            self.log.debug('Stress cmd: {}'.format(stress_cmd))
                            self._run_all_stress_cmds(stress_queue, params)

        fullscan = self._get_fullscan_params()
        if fullscan:
            self.log.info('Fullscan target: {} Fullscan interval: {}'.format(fullscan['ks.cf'],
                                                                             fullscan['interval']))
            self.run_fullscan_thread(ks_cf=fullscan['ks.cf'], interval=fullscan['interval'])

        # Check if we shall wait for total_used_space or if nemesis wasn't started
        if not prepare_write_cmd or not self.params.get('nemesis_during_prepare'):
            self.db_cluster.wait_total_space_used_per_node(keyspace=None)
            self.db_cluster.start_nemesis()

        stress_read_cmd = self.params.get('stress_read_cmd', default=None)
        if stress_read_cmd:
            params = {'keyspace_num': keyspace_num, 'stress_cmd': stress_read_cmd}
            self._run_all_stress_cmds(stress_queue, params)

        for stress in stress_queue:
            self.verify_stress_thread(cs_thread_pool=stress)

        if (stress_read_cmd or stress_cmd) and validate_partitions:
            self.log.debug('Save partitions info after reads')
            partitions_dict_after = self.collect_partitions_info(table_name=table_name,
                                                                 primary_key_column=primary_key_column,
                                                                 save_into_file_name='partitions_rows_after.log')
            self.assertEqual(partitions_dict_before, partitions_dict_after,
                             msg='Row amount in partitions is not the same before and after running the nemesis')
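
The user-profile handling above pulls commented cassandra-stress lines out of the profile file and formats them with the uploaded profile path and the duration. A small sketch of that extraction with made-up profile content:

cont = ["# cassandra-stress user profile={} duration={} ops(insert=1) -rate threads=100\n"]
profile_dst = "/tmp/cs_profile.yaml"   # hypothetical path the profile is copied to
cs_duration = "50m"
cmds = [line.lstrip('#').strip() for line in cont if line.find('cassandra-stress') > 0]
stress_cmds = [cmd.format(profile_dst, cs_duration) for cmd in cmds]
# -> ['cassandra-stress user profile=/tmp/cs_profile.yaml duration=50m ops(insert=1) -rate threads=100']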
Example #7
 def create_snapshot(self):
     url = "http://{}:{}/api/v1/admin/tsdb/snapshot".format(
         normalize_ipv6_url(self.host), self.port)
     result = self.request(url, True)
     LOGGER.debug('Request result: {}'.format(result))
     return result
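
The snapshot endpoint belongs to the Prometheus admin API and only responds when the server was started with --web.enable-admin-api. A minimal sketch of the equivalent call with the requests library (host and port are placeholders):

import requests

response = requests.post("http://10.0.0.1:9090/api/v1/admin/tsdb/snapshot", timeout=30)
snapshot_name = response.json()["data"]["name"]
# the snapshot files end up under <prometheus-data-dir>/snapshots/<snapshot_name>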
Example #8
 def create_endpoint_url(self, node):
     return 'http://{}:{}'.format(normalize_ipv6_url(node.external_address),
                                  self.params.get("alternator_port"))