def main():
    """Fetch all projects listed in the remote build manifest."""
    args = get_args()

    spec = ClusterSpec()
    spec.parse(args.cluster)

    helper = RemoteHelper(spec, args.verbose)
    helper.get_manifest()

    fetch(parse_manifest())
def main():
    """Restore every host in the cluster from its system backup."""
    args = get_args()

    spec = ClusterSpec()
    spec.parse(args.cluster_spec_fname)

    helper = RemoteHelper(spec, verbose=False)

    logger.info('Recovering system state')
    backup_versions = helper.get_system_backup_version()
    for host, version in backup_versions.items():
        helper.start_system_state_recovery(host, version)
def main():
    """Restore every host in the cluster from its system backup."""
    options, args = get_options()

    spec = ClusterSpec()
    spec.parse(options.cluster_spec_fname, args)

    helper = RemoteHelper(spec, test_config=None, verbose=False)

    logger.info('Recovering system state')
    for host, version in helper.get_system_backup_version().items():
        helper.start_system_state_recovery(host, version)
def main():
    """Collect support info from all nodes, one zip archive per hostname."""
    options, args = get_options()

    spec = ClusterSpec()
    spec.parse(options.cluster_spec_fname, args)

    helper = RemoteHelper(spec, test_config=None, verbose=False)
    helper.collect_info()

    # cbcollect drops its archives into per-host directories; flatten them
    for hostname in spec.yield_hostnames():
        for archive in glob.glob('{}/*.zip'.format(hostname)):
            shutil.move(archive, '{}.zip'.format(hostname))
class MongoDBInstaller(CouchbaseInstaller):

    """Deploy MongoDB from a fixed upstream tarball."""

    URL = 'http://fastdl.mongodb.org/linux/mongodb-linux-x86_64-2.6.1.tgz'

    def __init__(self, cluster_spec, options):
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)

    def install_package(self):
        self.remote.install_mongodb(url=self.URL)

    def clean_data(self):
        self.remote.clean_mongodb()

    def uninstall_package(self):
        # Tarball deployment leaves no OS packages behind
        pass
def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
             verbose: bool):
    """Wire up all helpers shared by perf tests.

    Note: construction order matters — ``self.rest`` must exist before the
    build version is resolved, and ``self.build`` before the reporter.
    """
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    self.target_iterator = TargetIterator(cluster_spec, test_config)
    self.cluster = ClusterManager(cluster_spec, test_config)
    self.memcached = MemcachedHelper(test_config)
    self.monitor = Monitor(cluster_spec, test_config, verbose)
    self.rest = RestHelper(cluster_spec)
    self.remote = RemoteHelper(cluster_spec, verbose)
    self.profiler = Profiler(cluster_spec, test_config)
    # Build version as reported by the first master node
    self.master_node = next(cluster_spec.masters)
    self.build = self.rest.get_version(self.master_node)
    self.metrics = MetricHelper(self)
    self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)
    self.cbmonitor_snapshots = []
    self.cbmonitor_clusters = []
    # Worker processes are only needed for tests that generate load
    if self.test_config.test_case.use_workers:
        self.worker_manager = WorkerManager(cluster_spec, test_config,
                                            verbose)
def __init__(self, cluster_spec, test_config, experiment=None):
    """Wire up helpers for a perf test run (Python 2 era code).

    :param experiment: optional experiment descriptor; when given an
        ExperimentHelper is attached as well.
    """
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    self.target_iterator = TargetIterator(self.cluster_spec,
                                          self.test_config)
    self.memcached = MemcachedHelper(cluster_spec)
    self.monitor = Monitor(cluster_spec)
    self.rest = RestHelper(cluster_spec)
    self.remote = RemoteHelper(cluster_spec)
    if experiment:
        self.experiment = ExperimentHelper(experiment,
                                           cluster_spec, test_config)
    # Python 2 iterator protocol (.next); the first master defines the build
    self.master_node = cluster_spec.yield_masters().next()
    self.build = self.rest.get_version(self.master_node)
    self.cbagent = CbAgent(self)
    self.metric_helper = MetricHelper(self)
    self.reporter = Reporter(self)
    self.reports = {}
    self.snapshots = []
    self.master_events = []
    # Worker processes are only needed for tests that generate load
    if self.test_config.test_case.use_workers:
        self.worker_manager = WorkerManager(cluster_spec, test_config)
def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
             verbose: bool = False):
    """Wire up cluster-management helpers.

    On dynamic (cloud/Kubernetes) infrastructure the node count is not
    fixed up front, hence ``initial_nodes`` is left as None there.
    """
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    self.dynamic_infra = self.cluster_spec.dynamic_infrastructure
    self.rest = RestHelper(cluster_spec, test_config)
    self.remote = RemoteHelper(cluster_spec, verbose)
    self.monitor = Monitor(cluster_spec, test_config, verbose)
    self.memcached = MemcachedHelper(test_config)
    self.master_node = next(self.cluster_spec.masters)
    if self.dynamic_infra:
        self.initial_nodes = None
    else:
        self.initial_nodes = test_config.cluster.initial_nodes
    # Build version as reported by the master node
    self.build = self.rest.get_version(self.master_node)
def main():
    """Collect per-node support archives, plus tools logs when backed up."""
    args = get_args()

    spec = ClusterSpec()
    spec.parse(args.cluster_spec_fname)

    helper = RemoteHelper(spec, test_config=None, verbose=False)
    helper.collect_info()

    # Flatten the per-host cbcollect directories into single zip files
    for hostname in spec.servers:
        for archive in glob.glob('{}/*.zip'.format(hostname)):
            shutil.move(archive, '{}.zip'.format(hostname))

    if spec.backup is not None:
        logs_dir = os.path.join(spec.backup, 'logs')
        if os.path.exists(logs_dir):
            shutil.make_archive('tools', 'zip', logs_dir)
def __init__(self, cluster_spec, test_config):
    """Cache cluster topology accessors and basic cluster settings."""
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    self.rest = RestHelper(cluster_spec)
    self.remote = RemoteHelper(cluster_spec)
    self.monitor = Monitor(cluster_spec)
    self.memcached = MemcachedHelper(cluster_spec)
    # yield_clusters() is invoked here (a generator object is stored);
    # the remaining accessors are stored unbound and called by consumers.
    self.clusters = cluster_spec.yield_clusters()
    self.servers = cluster_spec.yield_servers
    self.masters = cluster_spec.yield_masters
    self.hostnames = cluster_spec.yield_hostnames
    self.initial_nodes = test_config.cluster.initial_nodes
    self.mem_quota = test_config.cluster.mem_quota
    # Single server group unless explicitly configured otherwise
    self.group_number = test_config.cluster.group_number or 1
def main():
    """Collect per-node support archives, plus tools logs when backed up."""
    args = get_args()

    spec = ClusterSpec()
    spec.parse(args.cluster_spec_fname)

    helper = RemoteHelper(spec, verbose=False)
    helper.collect_info()

    # Flatten the per-host cbcollect directories into single zip files
    for hostname in spec.servers:
        for archive in glob.glob('{}/*.zip'.format(hostname)):
            shutil.move(archive, '{}.zip'.format(hostname))

    if spec.backup is not None:
        logs_dir = os.path.join(spec.backup, 'logs')
        if os.path.exists(logs_dir):
            shutil.make_archive('tools', 'zip', logs_dir)
def __init__(self, test, verbose):
    """Build the cbmonitor cluster map and collector settings for a test.

    :param test: the running test instance (supplies cluster spec, test
        config and build version)
    :param verbose: verbosity flag passed through to RemoteHelper
    """
    self.clusters = OrderedDict()
    self.remote = RemoteHelper(test.cluster_spec, test.test_config,
                               verbose=verbose)

    # One cbmonitor cluster per physical cluster, keyed by a unique name
    for cluster_name, servers in test.cluster_spec.yield_clusters():
        cluster = '{}_{}_{}'.format(cluster_name,
                                    test.build.replace('.', ''),
                                    uhex()[:3])
        master = servers[0].split(':')[0]
        self.clusters[cluster] = master

    if test.test_config.test_case.monitor_clients:
        # dict views are not subscriptable in Python 3, so take the first
        # inserted key via an iterator instead of `.items()[0][0]`; the
        # first key never changes, so it is hoisted out of the loop.
        first_cluster = next(iter(self.clusters))
        for node in test.cluster_spec.workers:
            cluster = '{}{}'.format(first_cluster[:-3], uhex()[:3])
            master = node.split(':')[0]
            self.clusters[cluster] = master

    # Remember the first index-service node (if any) for index stats
    self.index_node = ''
    for _, servers in test.cluster_spec.yield_servers_by_role('index'):
        if servers:
            self.index_node = servers[0].split(':')[0]

    # Tests may opt into monitoring all buckets / all hostnames
    if hasattr(test, 'ALL_BUCKETS'):
        buckets = None
    else:
        buckets = test.test_config.buckets[:1]
    if hasattr(test, 'ALL_HOSTNAMES'):
        hostnames = tuple(test.cluster_spec.yield_hostnames())
    else:
        hostnames = None

    # Ad-hoc settings object consumed by the stats collectors
    self.settings = type('settings', (object,), {
        'seriesly_host': test.test_config.stats_settings.seriesly['host'],
        'cbmonitor_host_port': test.test_config.stats_settings.cbmonitor['host'],
        'interval': test.test_config.stats_settings.interval,
        'secondary_statsfile': test.test_config.stats_settings.secondary_statsfile,
        'buckets': buckets,
        'hostnames': hostnames,
        'sync_gateway_nodes': test.remote.gateways if test.remote else None,
        'monitor_clients': test.cluster_spec.workers
        if test.test_config.test_case.monitor_clients else None,
        'fts_server': test.test_config.test_case.fts_server,
    })()
    self.lat_interval = test.test_config.stats_settings.lat_interval
    if test.cluster_spec.ssh_credentials:
        self.settings.ssh_username, self.settings.ssh_password = \
            test.cluster_spec.ssh_credentials
    self.settings.rest_username, self.settings.rest_password = \
        test.cluster_spec.rest_credentials
    self.settings.bucket_password = test.test_config.bucket.password
    self.settings.index_node = self.index_node

    self.collectors = []
    self.processes = []
    self.snapshots = []
    self.fts_stats = None
def __init__(self, cluster_spec, test_config, verbose):
    """Keep restore context plus a remote helper for running the restore."""
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    self.verbose = verbose

    # Snapshot to restore, taken from the restore settings
    self.snapshot = self.test_config.restore_settings.snapshot
    self.remote = RemoteHelper(self.cluster_spec, self.test_config,
                               self.verbose)
def __init__(self, cluster_spec, options):
    """Detect the target platform and resolve the build to install."""
    self.remote_helper = RemoteHelper(cluster_spec)
    self.cluster_spec = cluster_spec

    # The platform triple determines the package file name
    arch = self.remote_helper.detect_arch()
    pkg = self.remote_helper.detect_pkg()
    openssl = self.remote_helper.detect_openssl(pkg)

    self.build = Build(arch, pkg, options.version, openssl, options.toy)
    logger.info('Target build info: {}'.format(self.build))
def calc_network_bandwidth(self):
    """Aggregate in/out network traffic across the cluster servers.

    Returns an OrderedDict with the total bytes transferred to and from
    the first cluster's servers.
    """
    self.remote = RemoteHelper(self.cluster_spec, self.test_config,
                               verbose=True)

    for cluster_name, servers in self.cluster_spec.yield_clusters():
        self.in_bytes_transfer += [self.remote.read_bandwidth_stats("to", servers)]
        self.out_bytes_transfer += [self.remote.read_bandwidth_stats("from", servers)]

    # logger.info treats extra positional args as %-format arguments; the
    # original passed the stats dicts that way with no placeholder in the
    # message, so they were never rendered. Format them explicitly.
    logger.info('in bytes: {}'.format(self.in_bytes_transfer))
    logger.info('out bytes: {}'.format(self.out_bytes_transfer))

    return OrderedDict((
        ('in bytes', sum(self.in_bytes_transfer[0].values())),
        ('out bytes', sum(self.out_bytes_transfer[0].values())),
    ))
def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
             verbose: bool):
    """Start (or restart) the remote worker processes.

    Any workers left over from a previous run are terminated first, then
    fresh ones are started and awaited.
    """
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    self.remote = RemoteHelper(cluster_spec, verbose)
    # Round-robin over the worker hosts when assigning jobs
    self.workers = cycle(self.cluster_spec.workers)
    self.terminate()
    self.start()
    self.wait_until_workers_are_ready()
def __init__(self, test):
    """Build the cbmonitor cluster map and collector settings for a test.

    :param test: the running test instance (supplies cluster spec, test
        config and build version)
    """
    self.clusters = OrderedDict()
    self.remote = RemoteHelper(test.cluster_spec, test.test_config,
                               verbose=True)

    # One cbmonitor cluster per physical cluster, keyed by a unique name
    for cluster_name, servers in test.cluster_spec.yield_clusters():
        cluster = "{}_{}_{}".format(cluster_name,
                                    test.build.replace(".", ""),
                                    uhex()[:3])
        master = servers[0].split(":")[0]
        self.clusters[cluster] = master

    if test.test_config.test_case.monitor_clients:
        # dict views are not subscriptable in Python 3, so take the first
        # inserted key via an iterator instead of `.items()[0][0]`; the
        # first key never changes, so it is hoisted out of the loop.
        first_cluster = next(iter(self.clusters))
        for node in test.cluster_spec.workers:
            cluster = "{}{}".format(first_cluster[:-3], uhex()[:3])
            master = node.split(":")[0]
            self.clusters[cluster] = master

    # Remember the first index-service node (if any) for index stats
    self.index_node = ""
    for _, servers in test.cluster_spec.yield_servers_by_role("index"):
        if servers:
            self.index_node = servers[0].split(":")[0]

    # Tests may opt into monitoring all buckets / all hostnames
    if hasattr(test, "ALL_BUCKETS"):
        buckets = None
    else:
        buckets = test.test_config.buckets[:1]
    if hasattr(test, "ALL_HOSTNAMES"):
        hostnames = tuple(test.cluster_spec.yield_hostnames())
    else:
        hostnames = None

    # Ad-hoc settings object consumed by the stats collectors
    self.settings = type(
        "settings",
        (object,),
        {
            "seriesly_host": test.test_config.stats_settings.seriesly["host"],
            "cbmonitor_host_port": test.test_config.stats_settings.cbmonitor["host"],
            "interval": test.test_config.stats_settings.interval,
            "secondary_statsfile": test.test_config.stats_settings.secondary_statsfile,
            "buckets": buckets,
            "hostnames": hostnames,
            "sync_gateway_nodes": test.remote.gateways if test.remote else None,
            "monitor_clients": test.cluster_spec.workers
            if test.test_config.test_case.monitor_clients else None,
        },
    )()
    self.lat_interval = test.test_config.stats_settings.lat_interval
    if test.cluster_spec.ssh_credentials:
        self.settings.ssh_username, self.settings.ssh_password = \
            test.cluster_spec.ssh_credentials
    self.settings.rest_username, self.settings.rest_password = \
        test.cluster_spec.rest_credentials
    self.settings.bucket_password = test.test_config.bucket.password
    self.settings.index_node = self.index_node

    self.collectors = []
    self.processes = []
    self.snapshots = []
    self.bandwidth = False
def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
             verbose: bool):
    """Wire up helpers for a Sync Gateway performance test."""
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    self.memcached = MemcachedHelper(test_config)
    self.remote = RemoteHelper(cluster_spec, test_config, verbose)
    self.rest = RestHelper(cluster_spec)
    # self.build = os.environ.get('SGBUILD') or "0.0.0-000"
    self.master_node = next(cluster_spec.masters)
    # Sync Gateway version as reported by the master node
    self.build = self.rest.get_sgversion(self.master_node)
    self.metrics = MetricHelper(self)
    self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)
    # Worker processes are only needed for tests that generate load
    if self.test_config.test_case.use_workers:
        self.worker_manager = WorkerManager(cluster_spec, test_config,
                                            verbose)
    # Access settings are extended with the Sync Gateway specific knobs
    self.settings = self.test_config.access_settings
    self.settings.syncgateway_settings = self.test_config.syncgateway_settings
    self.profiler = Profiler(cluster_spec, test_config)
    self.cluster = ClusterManager(cluster_spec, test_config)
    self.target_iterator = TargetIterator(cluster_spec, test_config)
    self.monitor = Monitor(cluster_spec, test_config, verbose)
def __init__(self, cluster_spec, options):
    """Resolve the sherlock build location for the requested version."""
    self.remote = RemoteHelper(cluster_spec, None, options.verbose)
    self.cluster_spec = cluster_spec

    arch = self.remote.detect_arch()
    pkg = self.remote.detect_pkg()

    # Version strings look like "<release>-<build>"
    release, build = options.version.split('-')
    self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(build)

    self.build = Build(arch, pkg, options.version, release, build,
                       options.toy)
    logger.info('Target build info: {}'.format(self.build))
def main():
    """Collect logs from all servers and fail on panics/crashes/corruption."""
    args = get_args()

    spec = ClusterSpec()
    spec.parse(args.cluster_spec_fname)

    helper = RemoteHelper(spec, verbose=False)
    helper.collect_info()

    # Flatten the per-host cbcollect directories into single zip files
    for hostname in spec.servers:
        for archive in glob.glob('{}/*.zip'.format(hostname)):
            shutil.move(archive, '{}.zip'.format(hostname))

    if spec.backup is not None:
        logs_dir = os.path.join(spec.backup, 'logs')
        if os.path.exists(logs_dir):
            shutil.make_archive('tools', 'zip', logs_dir)

    # Scan every collected archive for known failure signatures
    failures = defaultdict(dict)
    for file_name in glob.iglob('./*.zip'):
        panic_files, crash_files, storage_corrupted = validate_logs(file_name)
        if panic_files:
            failures['panics'][file_name] = panic_files
        if crash_files:
            failures['crashes'][file_name] = crash_files
        if storage_corrupted:
            failures['storage_corrupted'][file_name] = True

    helper.collect_index_datafiles()

    if failures:
        logger.interrupt(
            "Following failures found: {}".format(pretty_dict(failures)))
def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
             verbose: bool = False):
    """Wire up the helpers needed to manage an existing cluster."""
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    self.rest = RestHelper(cluster_spec)
    self.remote = RemoteHelper(cluster_spec, verbose)
    self.monitor = Monitor(cluster_spec, test_config, verbose)
    self.memcached = MemcachedHelper(test_config)
    # The first master drives cluster-wide REST operations
    self.master_node = next(self.cluster_spec.masters)
    self.initial_nodes = test_config.cluster.initial_nodes
def __init__(self, cluster_spec, options):
    """Resolve download locations and build metadata for the installer.

    The build number from ``options.version`` (``release-build``) is
    substituted into the sherlock/watson/toy build URLs.
    """
    self.options = options
    self.remote = RemoteHelper(cluster_spec, None, options.verbose)
    self.cluster_spec = cluster_spec
    arch = self.remote.detect_arch()
    pkg = self.remote.detect_pkg()
    release = None
    build = None
    if options.version:
        release, build = options.version.split('-')
    self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(
        build)
    self.WATSON_BUILDS = 'http://172.23.120.24/builds/latestbuilds/couchbase-server/watson/{}/'.format(
        build)
    if options.toy:
        # Toy builds live under a per-developer directory
        self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/toy-{}/{}/'.format(
            options.toy, build)
    self.build = Build(arch, pkg, options.cluster_edition, options.version,
                       release, build, options.toy, options.url)
    logger.info('Target build info: {}'.format(self.build))
def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
             verbose: bool):
    """Wire up all helpers shared by perf tests (dynamic-infra aware)."""
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    # True when running on Kubernetes-style dynamic infrastructure
    self.dynamic_infra = self.cluster_spec.dynamic_infrastructure
    self.target_iterator = TargetIterator(cluster_spec, test_config)
    self.cluster = ClusterManager(cluster_spec, test_config)
    self.remote = RemoteHelper(cluster_spec, verbose)
    self.profiler = Profiler(cluster_spec, test_config)
    self.master_node = next(cluster_spec.masters)
    self.memcached = MemcachedHelper(test_config)
    self.monitor = Monitor(cluster_spec, test_config, verbose)
    self.rest = RestHelper(cluster_spec)
    # Build version reported by the master node; used by the reporter
    self.build = self.rest.get_version(self.master_node)
    self.metrics = MetricHelper(self)
    self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)
    self.cbmonitor_snapshots = []
    self.cbmonitor_clusters = []
    # Worker processes are only needed for tests that generate load
    if self.test_config.test_case.use_workers:
        self.worker_manager = WorkerManager(cluster_spec, test_config,
                                            verbose)
class FIOTest(PerfTest):

    """Run fio on the cluster nodes and report aggregate IOPS."""

    # Benchmark-tracking service that receives the datapoints
    TRACKER = 'fio.sc.couchbase.com'

    # Skeleton of a single benchmark datapoint
    TEMPLATE = {
        'group': '{}, random mixed reads and writes, IOPS',
        'metric': None,
        'value': None,
    }

    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)

    def __exit__(self, *args, **kwargs):
        # No teardown needed for this test
        pass

    @staticmethod
    def _parse(results):
        """Parse the test output.

        See also https://github.com/axboe/fio/blob/master/HOWTO
        """
        stats = defaultdict(int)
        for host, output in results.items():
            for job in output.split():
                fields = job.split(';')
                stats[host] += int(fields[7])   # reads
                stats[host] += int(fields[48])  # writes
        return stats

    def _post(self, data):
        data = pretty_dict(data)
        logger.info('Posting: {}'.format(data))
        requests.post('http://{}/api/v1/benchmarks'.format(self.TRACKER),
                      data=data)

    def _report_kpi(self, stats):
        """Post one datapoint per host, using the host as the metric name."""
        for host, iops in stats.items():
            datapoint = dict(self.TEMPLATE)
            datapoint['group'] = datapoint['group'].format(
                self.cluster_spec.name.title())
            datapoint['metric'] = host
            datapoint['value'] = iops
            self._post(datapoint)

    def run(self):
        stats = self.remote.fio(self.test_config.fio['config'])
        self._report_kpi(self._parse(stats))
def download_local(self):
    """Download and save a local copy of the specified package.

    Best effort: any failure is logged and ignored, since the local copy
    is only a convenience. Only Ubuntu/Debian hosts are handled.
    """
    try:
        if RemoteHelper.detect_server_os("127.0.0.1").upper() in ('UBUNTU',
                                                                  'DEBIAN'):
            os_release = detect_ubuntu_release()
            url = self.find_package(edition=self.options.edition,
                                    package="deb", os_release=os_release)
            logger.info('Saving a local copy of {}'.format(url))
            with open('couchbase.deb', 'wb') as fh:
                resp = requests.get(url)
                fh.write(resp.content)
    except Exception as err:
        # Was `except (Exception, BaseException)`, which is redundant and
        # also swallowed KeyboardInterrupt/SystemExit. Keep the best-effort
        # semantics but only for ordinary errors, and log the cause.
        logger.info("Saving local copy for ubuntu failed, package may not "
                    "present: {}".format(err))
class FIOTest(PerfTest):

    """Run fio on the cluster nodes and report aggregate IOPS."""

    # Benchmark-tracking service that receives the datapoints
    TRACKER = 'fio.sc.couchbase.com'

    # Skeleton of a single benchmark datapoint
    TEMPLATE = {
        'group': '{}, random mixed reads and writes, IOPS',
        'metric': None,
        'value': None,
    }

    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)

    def __exit__(self, *args, **kwargs):
        # No teardown needed for this test
        pass

    @staticmethod
    def _parse(results):
        """Terse output parsing is based on the following guide:

        https://github.com/axboe/fio/blob/master/HOWTO
        """
        stats = defaultdict(int)
        for host, output in results.items():
            for job in output.split():
                stats[host] += int(job.split(';')[7])   # reads
                stats[host] += int(job.split(';')[48])  # writes
        return stats

    def _post(self, data):
        data = pretty_dict(data)
        logger.info('Posting: {}'.format(data))
        requests.post('http://{}/api/v1/benchmarks'.format(self.TRACKER),
                      data=data)

    def _report_kpi(self, stats):
        # One datapoint per host, using the host as the metric name
        for host, iops in stats.items():
            data = self.TEMPLATE.copy()
            data['group'] = data['group'].format(self.cluster_spec.name.title())
            data['metric'] = host
            data['value'] = iops
            self._post(data)

    def run(self):
        stats = self.remote.fio(self.test_config.fio['config'])
        self._report_kpi(self._parse(stats))
def __init__(self, *args, **kwargs):
    """Parse CLI options and wire up functional-test helpers.

    The leftover CLI arguments are fed to the spec/config parsers; the
    constructor's own ``*args`` are forwarded to the base class.
    """
    # Bind the CLI leftovers to a private name: the original rebound
    # `args`, clobbering the *args parameter, so super().__init__()
    # received the CLI leftovers instead of the constructor's arguments
    # (the sibling variant of this method uses `_args` for this reason).
    options, _args = get_options()

    self.cluster_spec = ClusterSpec()
    self.cluster_spec.parse(options.cluster_spec_fname, _args)
    self.test_config = TestConfig()
    self.test_config.parse(options.test_config_fname, _args)

    self.target_iterator = TargetIterator(self.cluster_spec,
                                          self.test_config)
    self.memcached = MemcachedHelper(self.test_config)
    self.remote = RemoteHelper(self.cluster_spec, self.test_config)
    self.rest = RestHelper(self.cluster_spec)

    super(FunctionalTest, self).__init__(*args, **kwargs)
def __init__(self, cluster_spec, options):
    """Collect paths and metadata for the Couchbase operator deployment.

    The operator version string is expected to look like
    ``<release>-<build>`` (e.g. ``2.2.0-123``).
    """
    self.options = options
    self.cluster_spec = cluster_spec
    self.operator_version = self.options.operator_version
    self.couchbase_version = self.options.couchbase_version
    self.node_count = len(
        self.cluster_spec.infrastructure_clusters['couchbase1'].split())
    self.remote = RemoteHelper(cluster_spec)
    self.release = self.operator_version.split("-")[0]
    self.build = self.operator_version.split("-")[1]
    self.docker_config_path = os.path.expanduser("~") + "/.docker/config.json"
    # NOTE(review): release[0]/release[2] pick the major/minor characters
    # of e.g. "2.2.0" -> cloud/operator/2/2; this assumes single-digit
    # version components — confirm for double-digit releases.
    self.operator_base_path = "cloud/operator/{}/{}".format(self.release[0],
                                                            self.release[2])
    self.certificate_authority_path = "{}/ca.crt".format(self.operator_base_path)
    self.crd_path = "{}/crd.yaml".format(self.operator_base_path)
    self.config_path = "{}/config.yaml".format(self.operator_base_path)
    self.config_template_path = "{}/config_template.yaml".format(
        self.operator_base_path)
    self.auth_path = "{}/auth_secret.yaml".format(self.operator_base_path)
    self.cb_cluster_path = "{}/couchbase-cluster.yaml".format(
        self.operator_base_path)
    self.template_cb_cluster_path = "{}/couchbase-cluster_template.yaml"\
        .format(self.operator_base_path)
    self.worker_base_path = "cloud/worker"
    self.worker_path = "{}/worker.yaml".format(self.worker_base_path)
    self.rmq_base_path = "cloud/broker/rabbitmq/0.48"
    self.rmq_operator_path = "{}/cluster-operator.yaml".format(self.rmq_base_path)
    self.rmq_cluster_path = "{}/rabbitmq.yaml".format(self.rmq_base_path)
def __init__(self, cluster_spec, test_config, verbose):
    """Cache cluster topology accessors and basic cluster settings."""
    self.cluster_spec = cluster_spec
    self.test_config = test_config
    self.rest = RestHelper(cluster_spec)
    self.remote = RemoteHelper(cluster_spec, test_config, verbose)
    self.monitor = Monitor(cluster_spec)
    self.memcached = MemcachedHelper(test_config)
    # yield_clusters() is invoked here (a generator object is stored);
    # the remaining accessors are stored unbound and called by consumers.
    self.clusters = cluster_spec.yield_clusters()
    self.servers = cluster_spec.yield_servers
    self.masters = cluster_spec.yield_masters
    self.initial_nodes = test_config.cluster.initial_nodes
    self.mem_quota = test_config.cluster.mem_quota
    # Single server group unless explicitly configured otherwise
    self.group_number = test_config.cluster.group_number or 1
def __init__(self, *args, **kwargs):
    """Parse CLI options and wire up functional-test helpers.

    Leftover CLI arguments are turned into test-config overrides of the
    form ``section.option.value`` (comma-separated).
    """
    options, _args = get_options()
    # Lazily-evaluated generator of override triples; falsy when no args
    override = \
        _args and (arg.split('.') for arg in ' '.join(_args).split(','))
    self.cluster_spec = ClusterSpec()
    self.cluster_spec.parse(options.cluster_spec_fname)
    self.test_config = TestConfig()
    self.test_config.parse(options.test_config_fname, override)

    self.target_iterator = TargetIterator(self.cluster_spec,
                                          self.test_config)
    self.memcached = MemcachedHelper(self.cluster_spec)
    self.remote = RemoteHelper(self.cluster_spec)

    super(FunctionalTest, self).__init__(*args, **kwargs)
def __init__(self, cluster_spec, options):
    """Detect the platform and resolve the build under test."""
    self.options = options
    self.remote = RemoteHelper(cluster_spec, None, options.verbose)
    self.cluster_spec = cluster_spec

    arch = self.remote.detect_arch()
    pkg = self.remote.detect_pkg()

    # Version strings look like "<release>-<build>"; both parts stay None
    # when no version was requested
    release = build = None
    if options.version:
        release, build = options.version.split('-')

    self.build = Build(arch, pkg, options.cluster_edition, options.version,
                       release, build, options.url)
    logger.info('Target build info: {}'.format(self.build))
def __init__(self, cluster_spec, options):
    """Resolve download locations and build metadata for the installer.

    The build number from ``options.version`` (``release-build``) is
    substituted into the sherlock/watson/toy build URLs.
    """
    self.options = options
    self.remote = RemoteHelper(cluster_spec, None, options.verbose)
    self.cluster_spec = cluster_spec
    arch = self.remote.detect_arch()
    pkg = self.remote.detect_pkg()
    release = None
    build = None
    if options.version:
        release, build = options.version.split('-')
    self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(build)
    self.WATSON_BUILDS = 'http://172.23.120.24/builds/latestbuilds/couchbase-server/watson/{}/'.format(build)
    if options.toy:
        # Toy builds live under a per-developer directory
        self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/toy-{}/{}/'.format(options.toy, build)
    self.build = Build(arch, pkg, options.cluster_edition, options.version,
                       release, build, options.toy, options.url)
    logger.info('Target build info: {}'.format(self.build))
class Monitor(RestHelper): MAX_RETRY = 150 MAX_RETRY_RECOVERY = 1200 MAX_RETRY_TIMER_EVENT = 18000 MAX_RETRY_BOOTSTRAP = 1200 MONITORING_DELAY = 5 POLLING_INTERVAL = 2 POLLING_INTERVAL_INDEXING = 1 POLLING_INTERVAL_MACHINE_UP = 10 POLLING_INTERVAL_ANALYTICS = 15 POLLING_INTERVAL_EVENTING = 1 REBALANCE_TIMEOUT = 3600 * 6 TIMEOUT = 3600 * 12 DISK_QUEUES = ( 'ep_queue_size', 'ep_flusher_todo', 'ep_diskqueue_items', 'vb_active_queue_size', 'vb_replica_queue_size', ) DCP_QUEUES = ( 'ep_dcp_replica_items_remaining', 'ep_dcp_other_items_remaining', ) XDCR_QUEUES = ( 'replication_changes_left', ) def __init__(self, cluster_spec, test_config, verbose): super().__init__(cluster_spec=cluster_spec) self.cluster_spec = cluster_spec self.test_config = test_config self.remote = RemoteHelper(cluster_spec, verbose) def monitor_rebalance(self, host): logger.info('Monitoring rebalance status') is_running = True last_progress = 0 last_progress_time = time.time() while is_running: time.sleep(self.POLLING_INTERVAL) is_running, progress = self.get_task_status(host, task_type='rebalance') if progress == last_progress: if time.time() - last_progress_time > self.REBALANCE_TIMEOUT: logger.error('Rebalance hung') break else: last_progress = progress last_progress_time = time.time() if progress is not None: logger.info('Rebalance progress: {} %'.format(progress)) logger.info('Rebalance completed') def _wait_for_empty_queues(self, host, bucket, queues, stats_function): metrics = list(queues) start_time = time.time() while metrics: bucket_stats = stats_function(host, bucket) # As we are changing metrics in the loop; take a copy of it to # iterate over. 
for metric in list(metrics): stats = bucket_stats['op']['samples'].get(metric) if stats: last_value = stats[-1] if last_value: logger.info('{} = {:,}'.format(metric, last_value)) continue else: logger.info('{} reached 0'.format(metric)) metrics.remove(metric) if metrics: time.sleep(self.POLLING_INTERVAL) if time.time() - start_time > self.TIMEOUT: raise Exception('Monitoring got stuck') def _wait_for_replication_completion(self, host, bucket, queues, stats_function, link1, link2): metrics = list(queues) completion_count = 0 link1_time = 0 link2_items = 0 link1_compelteness_str = \ 'replications/{}/bucket-1/bucket-1/percent_completeness'.format(link1) link2_compelteness_str = \ 'replications/{}/bucket-1/bucket-1/percent_completeness'.format(link2) link2_items_str = \ 'replications/{}/bucket-1/bucket-1/docs_written'.format(link2) start_time = time.time() while metrics: bucket_stats = stats_function(host, bucket) # As we are changing metrics in the loop; take a copy of it to # iterate over. 
for metric in list(metrics): stats = bucket_stats['op']['samples'].get(metric) if stats: last_value = stats[-1] if last_value: logger.info('{} = {:,}'.format(metric, last_value)) link1_completeness = \ bucket_stats['op']['samples'].get(link1_compelteness_str)[-1] link2_completeness = \ bucket_stats['op']['samples'].get(link2_compelteness_str)[-1] if link1_completeness == 100 or \ link2_completeness == 100: if link1_completeness == 100: if completion_count == 0: link1_time = time.time() link2_items = \ bucket_stats['op']['samples'].get(link2_items_str)[-1] completion_count = completion_count + 1 elif link2_completeness == 100: if completion_count == 0: link1_time = time.time() link2_items = \ bucket_stats['op']['samples'].get(link2_items_str)[-1] completion_count = completion_count + 1 continue else: logger.info('{} reached 0'.format(metric)) if completion_count == 0: link1_time = time.time() link2_items = \ bucket_stats['op']['samples'].get(link2_items_str)[-1] completion_count = completion_count + 1 metrics.remove(metric) if metrics: time.sleep(self.POLLING_INTERVAL) if time.time() - start_time > self.TIMEOUT: raise Exception('Monitoring got stuck') return link1_time, link2_items def _wait_for_completeness(self, host, bucket, xdcr_link, stats_function): metrics = [] metrics.append(xdcr_link) start_time = time.time() while metrics: bucket_stats = stats_function(host, bucket) for metric in metrics: stats = bucket_stats['op']['samples'].get(metric) if stats: last_value = stats[0] if last_value != 100: logger.info('{} : {}'.format(metric, last_value)) elif last_value == 100: logger.info('{} Completed 100 %'.format(metric)) metrics.remove(metric) if metrics: time.sleep(self.POLLING_INTERVAL) if time.time() - start_time > self.TIMEOUT: raise Exception('Monitoring got stuck') def monitor_disk_queues(self, host, bucket): logger.info('Monitoring disk queues: {}'.format(bucket)) self._wait_for_empty_queues(host, bucket, self.DISK_QUEUES, self.get_bucket_stats) def 
monitor_dcp_queues(self, host, bucket): logger.info('Monitoring DCP queues: {}'.format(bucket)) self._wait_for_empty_queues(host, bucket, self.DCP_QUEUES, self.get_bucket_stats) def _wait_for_xdcr_to_start(self, host: str): is_running = False while not is_running: time.sleep(self.POLLING_INTERVAL) is_running, _ = self.get_task_status(host, task_type='xdcr') def xdcr_link_starttime(self, host: str, uuid: str): is_running = False while not is_running: time.sleep(self.POLLING_INTERVAL) is_running, _ = self.get_xdcrlink_status(host, task_type='xdcr', uuid=uuid) return time.time() def monitor_xdcr_queues(self, host: str, bucket: str): logger.info('Monitoring XDCR queues: {}'.format(bucket)) self._wait_for_xdcr_to_start(host) self._wait_for_empty_queues(host, bucket, self.XDCR_QUEUES, self.get_xdcr_stats) def monitor_xdcr_changes_left(self, host: str, bucket: str, xdcrlink1: str, xdcrlink2: str): logger.info('Monitoring XDCR queues: {}'.format(bucket)) self._wait_for_xdcr_to_start(host) start_time = time.time() link1_time, link2_items = self._wait_for_replication_completion(host, bucket, self.XDCR_QUEUES, self.get_xdcr_stats, xdcrlink1, xdcrlink2) return start_time, link1_time, link2_items def monitor_xdcr_completeness(self, host: str, bucket: str, xdcr_link: str): logger.info('Monitoring XDCR Link Completeness: {}'.format(bucket)) self._wait_for_completeness(host=host, bucket=bucket, xdcr_link=xdcr_link, stats_function=self.get_xdcr_stats) return time.time() def _get_num_items(self, host: str, bucket: str, total: bool = False) -> int: stats = self.get_bucket_stats(host=host, bucket=bucket) if total: curr_items = stats['op']['samples'].get('curr_items_tot') else: curr_items = stats['op']['samples'].get('curr_items') if curr_items: return curr_items[-1] return 0 def monitor_num_items(self, host: str, bucket: str, num_items: int): logger.info('Checking the number of items in {}'.format(bucket)) retries = 0 while retries < self.MAX_RETRY: curr_items = 
        # NOTE(review): this span starts mid-method — the enclosing `def` (an
        # item-count polling loop with `retries`/`curr_items`) is outside the
        # visible chunk. Tokens preserved as-is; the first line is presumably
        # the continuation of a `curr_items = \` assignment — TODO confirm.
                self._get_num_items(host, bucket, total=True)
            if curr_items == num_items:
                break
            else:
                logger.info('{}(curr_items) != {}(num_items)'.format(curr_items, num_items))
                time.sleep(self.POLLING_INTERVAL)
                retries += 1
        else:
            # for/else: retries exhausted without the counts ever matching
            actual_items = self._get_num_items(host, bucket, total=True)
            raise Exception('Mismatch in the number of items: {}'
                            .format(actual_items))

    def monitor_task(self, host, task_type):
        """Poll the task list on `host` until no task of `task_type` remains."""
        logger.info('Monitoring task: {}'.format(task_type))
        time.sleep(self.MONITORING_DELAY)

        while True:
            time.sleep(self.POLLING_INTERVAL)

            tasks = [task for task in self.get_tasks(host)
                     if task.get('type') == task_type]
            if tasks:
                for task in tasks:
                    logger.info('{}: {}%, bucket: {}, ddoc: {}'.format(
                        task_type,
                        task.get('progress'),
                        task.get('bucket'), task.get('designDocument')
                    ))
            else:
                break
        logger.info('Task {} successfully completed'.format(task_type))

    def monitor_warmup(self, memcached, host, bucket):
        """Poll warmup stats until state is b'done'; return the warmup time."""
        logger.info('Monitoring warmup status: {}@{}'.format(bucket, host))

        memcached_port = self.get_memcached_port(host)

        while True:
            stats = memcached.get_stats(host, memcached_port, bucket, 'warmup')
            if b'ep_warmup_state' in stats:
                state = stats[b'ep_warmup_state']
                if state == b'done':
                    return float(stats.get(b'ep_warmup_time', 0))
                else:
                    # NOTE(review): 'Warmpup' typo is in the runtime log
                    # string; preserved as-is.
                    logger.info('Warmpup status: {}'.format(state))
                    time.sleep(self.POLLING_INTERVAL)
            else:
                logger.info('No warmup stats are available, continue polling')
                time.sleep(self.POLLING_INTERVAL)

    def monitor_compression(self, memcached, host, bucket):
        """Poll until no active documents remain in uncompressed JSON form."""
        logger.info('Monitoring active compression status')

        memcached_port = self.get_memcached_port(host)

        json_docs = -1  # sentinel so the loop body runs at least once
        while json_docs:
            stats = memcached.get_stats(host, memcached_port, bucket)
            json_docs = int(stats[b'ep_active_datatype_json'])
            if json_docs:
                logger.info('Still uncompressed: {:,} items'.format(json_docs))
                time.sleep(self.POLLING_INTERVAL)
        logger.info('All items are compressed')

    def monitor_node_health(self, host):
        """Poll both node-status endpoints until every node is 'healthy'."""
        logger.info('Monitoring node health')

        for retry in range(self.MAX_RETRY):
            unhealthy_nodes = {
                n for n, status in self.node_statuses(host).items()
                if status != 'healthy'
            } | {
                n for n, status in self.node_statuses_v2(host).items()
                if status != 'healthy'
            }
            if unhealthy_nodes:
                time.sleep(self.POLLING_INTERVAL)
            else:
                break
        else:
            # for/else: retries exhausted with nodes still unhealthy
            logger.interrupt('Some nodes are not healthy: {}'.format(
                unhealthy_nodes
            ))

    def monitor_analytics_node_active(self, host):
        """Poll until the analytics node on `host` reports itself active."""
        logger.info('Monitoring analytics node health')

        for retry in range(self.MAX_RETRY):
            active = self.analytics_node_active(host)
            if active:
                break
            else:
                time.sleep(self.POLLING_INTERVAL)
        else:
            logger.interrupt('Analytics node still not healthy: {}'.format(
                host
            ))

    def is_index_ready(self, host: str) -> bool:
        """Return True only when every index on `host` reports 'Ready'."""
        for status in self.get_index_status(host)['status']:
            if status['status'] != 'Ready':
                return False
        return True

    def estimate_pending_docs(self, host: str) -> int:
        """Sum queued + pending doc counts across all GSI stats on `host`."""
        stats = self.get_gsi_stats(host)
        pending_docs = 0
        for metric, value in stats.items():
            if 'num_docs_queued' in metric or 'num_docs_pending' in metric:
                pending_docs += value
        return pending_docs

    def monitor_indexing(self, host):
        """Block until all indexes on `host` are ready, logging progress."""
        logger.info('Monitoring indexing progress')

        while not self.is_index_ready(host):
            time.sleep(self.POLLING_INTERVAL_INDEXING * 5)
            pending_docs = self.estimate_pending_docs(host)
            logger.info('Pending docs: {:,}'.format(pending_docs))
        logger.info('Indexing completed')

    def wait_for_secindex_init_build(self, host, indexes):
        """Poll until the initial build of `indexes` completes; return seconds elapsed (rounded)."""
        # POLL until initial index build is complete
        logger.info(
            "Waiting for the following indexes to be ready: {}".format(indexes))

        # one ready-flag slot per index, flipped to 1 when 'Ready'
        indexes_ready = [0 for _ in indexes]

        def get_index_status(json2i, index):
            """Return the index status."""
            for d in json2i["status"]:
                if d["name"] == index:
                    return d["status"]
            return None

        # retried because the status payload may transiently lack keys
        @misc.retry(catch=(KeyError,), iterations=10, wait=30)
        def update_indexes_ready():
            json2i = self.get_index_status(host)
            for i, index in enumerate(indexes):
                status = get_index_status(json2i, index)
                if status == 'Ready':
                    indexes_ready[i] = 1

        init_ts = time.time()
        while sum(indexes_ready) != len(indexes):
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            update_indexes_ready()
        finish_ts = time.time()
        logger.info('secondary index build time: {}'.format(finish_ts - init_ts))
        time_elapsed = round(finish_ts - init_ts)
        return time_elapsed

    def wait_for_secindex_incr_build(self, index_nodes, bucket, indexes, numitems):
        """Poll until incremental build drains (pending + queued hit zero for all indexes)."""
        # POLL until incremenal index build is complete
        logger.info('expecting {} num_docs_indexed for indexes {}'.format(numitems, indexes))

        # collect num_docs_indexed information globally from all index nodes
        def get_num_docs_indexed():
            data = self.get_index_stats(index_nodes)
            num_indexed = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_indexed"
                val = data[key]
                num_indexed.append(val)
            return num_indexed

        def get_num_docs_index_pending():
            data = self.get_index_stats(index_nodes)
            num_pending = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_pending"
                val1 = data[key]
                key = "" + bucket + ":" + index + ":num_docs_queued"
                val2 = data[key]
                val = int(val1) + int(val2)
                num_pending.append(val)
            return num_pending

        expected_num_pending = [0] * len(indexes)
        while True:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_num_pending = get_num_docs_index_pending()
            if curr_num_pending == expected_num_pending:
                break
        curr_num_indexed = get_num_docs_indexed()
        logger.info("Number of Items indexed {}".format(curr_num_indexed))

    def wait_for_num_connections(self, index_node, expected_connections):
        """Poll until the index node reaches the expected connection count; False on timeout."""
        curr_connections = self.get_index_num_connections(index_node)
        retry = 1
        while curr_connections < expected_connections and retry < self.MAX_RETRY:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_connections = self.get_index_num_connections(index_node)
            logger.info("Got current connections {}".format(curr_connections))
            retry += 1
        if retry == self.MAX_RETRY:
            return False
        return True

    def wait_for_recovery(self, index_nodes, bucket, index):
        """Return disk_load_duration once reported, or -1 if retries run out."""
        time.sleep(self.MONITORING_DELAY)
        for retry in range(self.MAX_RETRY_RECOVERY):
            response = self.get_index_stats(index_nodes)
            item = "{}:{}:disk_load_duration".format(bucket, index)
            if item in response:
                return response[item]
            else:
                time.sleep(self.POLLING_INTERVAL)
        return -1

    def wait_for_servers(self):
        """Poll until every server in the cluster spec answers is_up()."""
        for retry in range(self.MAX_RETRY):
            logger.info('Waiting for all servers to be available')
            time.sleep(self.POLLING_INTERVAL_MACHINE_UP)

            for server in self.cluster_spec.servers:
                if not self.remote.is_up(server):
                    break
            else:
                # for/else: no break — all servers responded
                logger.info('All nodes are up')
                return

        logger.interrupt('Some nodes are still down')

    def monitor_fts_indexing_queue(self, host: str, index: str, items: int):
        """Poll the FTS doc count until it reaches `items`."""
        logger.info('Waiting for indexing to finish')
        count = 0
        while count < items:
            count = self.get_fts_doc_count(host, index)
            logger.info('FTS indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_fts_index_persistence(self, hosts: list, index: str, bkt: str = None):
        """Poll until no FTS records remain to persist and no compactions run."""
        logger.info('Waiting for index to be persisted')
        if not bkt:
            bkt = self.test_config.buckets[0]
        pending_items = 1
        while pending_items:
            persist = 0
            compact = 0
            for host in hosts:
                stats = self.get_fts_stats(host)

                metric = '{}:{}:{}'.format(bkt, index, 'num_recs_to_persist')
                persist += stats[metric]

                metric = '{}:{}:{}'.format(bkt, index, 'total_compactions')
                compact += stats[metric]

            pending_items = persist or compact
            logger.info('Records to persist: {:,}'.format(persist))
            logger.info('Ongoing compactions: {:,}'.format(compact))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_indexing_queue(self, host: str, index: str):
        """Poll the Elasticsearch doc count up to the configured total."""
        logger.info(' Waiting for indexing to finish')
        items = int(self.test_config.fts_settings.test_total_docs)
        count = 0
        while count < items:
            count = self.get_elastic_doc_count(host, index)
            logger.info('Elasticsearch indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_index_persistence(self, host: str, index: str):
        """Poll until the index translog reports zero pending operations."""
        logger.info('Waiting for index to be persisted')

        pending_items = -1  # sentinel so the loop body runs at least once
        while pending_items:
            stats = self.get_elastic_stats(host)
            pending_items = stats['indices'][index]['total']['translog']['operations']
            logger.info('Records to persist: {:,}'.format(pending_items))
            time.sleep(self.POLLING_INTERVAL)

    def wait_for_bootstrap(self, nodes: list, function: str):
        """Wait until `function` shows as deployed on each eventing node."""
        logger.info('Waiting for bootstrap of eventing function: {} '.format(function))
        for node in nodes:
            retry = 1
            while retry < self.MAX_RETRY_BOOTSTRAP:
                if function in self.get_apps_with_status(node, "deployed"):
                    break
                time.sleep(self.POLLING_INTERVAL)
                retry += 1
            if retry == self.MAX_RETRY_BOOTSTRAP:
                logger.info('Failed to bootstrap function: {}, node: {}'.
                            format(function, node))

    def get_num_analytics_items(self, data_node: str, bucket: str) -> int:
        """Sum incoming record counts for `bucket` across all cbas nodes."""
        stats_key = '{}:all:incoming_records_count_total'.format(bucket)
        num_items = 0
        for node in self.get_active_nodes_by_role(data_node, 'cbas'):
            stats = self.get_analytics_stats(node)
            num_items += stats[stats_key]
        return num_items

    def monitor_data_synced(self, data_node: str, bucket: str) -> int:
        """Poll until analytics has ingested every item of `bucket`; return the item count."""
        logger.info('Waiting for data to be synced from {}'.format(data_node))

        num_items = self._get_num_items(data_node, bucket)

        while True:
            num_analytics_items = self.get_num_analytics_items(data_node,
                                                               bucket)
            if num_analytics_items == num_items:
                break
            logger.info('Analytics has {:,} docs (target is {:,})'.format(
                num_analytics_items, num_items))
            time.sleep(self.POLLING_INTERVAL_ANALYTICS)
        return num_items

    def wait_for_timer_event(self, node: str, function: str, event="timer_events"):
        """Wait until at least one timer event has been processed."""
        logger.info('Waiting for timer events to start processing: {} '.format(function))
        retry = 1
        while retry < self.MAX_RETRY_TIMER_EVENT:
            if 0 < self.get_num_events_processed(
                    event=event, node=node, name=function):
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info('Failed to get timer event for function: {}'.format(function))

    def wait_for_all_mutations_processed(self, host: str, bucket1: str, bucket2: str):
        """Wait until both buckets hold the same number of items."""
        logger.info('Waiting for mutations to be processed of eventing function')
        retry = 1
        while retry < self.MAX_RETRY_BOOTSTRAP:
            if self._get_num_items(host=host, bucket=bucket1) == \
                    self._get_num_items(host=host, bucket=bucket2):
                break
            retry += 1
            time.sleep(self.POLLING_INTERVAL_EVENTING)
        if retry == self.MAX_RETRY_BOOTSTRAP:
            logger.info('Failed to process all mutations... TIMEOUT')

    def wait_for_all_timer_creation(self, node: str, function: str):
        """Wait until timer responses catch up with DCP mutations."""
        logger.info('Waiting for all timers to be created by : {} '.format(function))
        retry = 1
        events_processed = {}
        while retry < self.MAX_RETRY_TIMER_EVENT:
            events_processed = self.get_num_events_processed(event="ALL",
                                                             node=node,
                                                             name=function)
            if events_processed["dcp_mutation"] == events_processed["timer_responses_received"]:
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info('Got only {} timers created for function: {}'.format(
                events_processed["timer_responses_received"], function))

    def wait_for_function_undeploy(self, node: str, function: str):
        """Wait until `function` appears in the undeployed app list."""
        logger.info('Waiting for {} function to undeploy'.format(function))
        retry = 1
        while retry < self.MAX_RETRY_TIMER_EVENT:
            op = self.get_apps_with_status(node, "undeployed")
            if function in op:
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info('Function {} failed to undeploy...!!!'.format(function))
def __init__(self, cluster_spec, test_config, verbose):
    """Initialize the base class, then keep the spec/config handles and a remote helper.

    :param cluster_spec: parsed cluster specification
    :param test_config: parsed test configuration
    :param verbose: verbosity flag forwarded to RemoteHelper
    """
    super().__init__(cluster_spec=cluster_spec)

    # Independent attribute assignments; order is not significant.
    self.test_config = test_config
    self.cluster_spec = cluster_spec
    self.remote = RemoteHelper(cluster_spec, verbose)
class GatewayInstaller(object):
    """Installs and starts Sync Gateway (gw) and Gateload (gl) packages.

    Packages are located on the internal CBFS mirror; the config template is
    rendered from templates/gateway_config_template.json.
    """

    CBFS = 'http://cbfs-ext.hq.couchbase.com/builds/'

    def __init__(self, cluster_spec, test_config, options):
        self.remote_helper = RemoteHelper(cluster_spec)
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.pkg = self.remote_helper.detect_pkg()
        self.version = options.version

    def find_package(self):
        """Probe CBFS for the versioned RPM; return (filename, url) or interrupt."""
        filename = 'couchbase-sync-gateway_{}_x86_64.rpm'.format(self.version)
        url = '{}{}'.format(self.CBFS, filename)
        try:
            status_code = requests.head(url).status_code
        except requests.exceptions.ConnectionError:
            pass  # fall through to the interrupt below
        else:
            if status_code == 200:
                logger.info('Found "{}"'.format(url))
                return filename, url
        logger.interrupt('Target build not found - {}'.format(url))

    def kill_processes_gw(self):
        self.remote_helper.kill_processes_gw()

    def kill_processes_gl(self):
        self.remote_helper.kill_processes_gl()

    def uninstall_package_gw(self):
        filename, url = self.find_package()
        self.remote_helper.uninstall_package_gw(self.pkg, filename)

    def uninstall_package_gl(self):
        self.remote_helper.uninstall_package_gl()

    def install_package_gw(self):
        filename, url = self.find_package()
        self.remote_helper.install_package_gw(self.pkg, url, filename,
                                              self.version)

    def install_package_gl(self):
        self.remote_helper.install_package_gl()

    def start_sync_gateways(self):
        """Render the gateway config from its template and start the service."""
        with open('templates/gateway_config_template.json') as fh:
            template = json.load(fh)

        # FIX: was `self.cluster_spec.yield_masters().next()` — generators
        # have no .next() method on Python 3; use the builtin next() instead
        # (consistent with the rest of the codebase).
        db_master = next(self.cluster_spec.yield_masters())
        template['databases']['db']['server'] = "http://*****:*****@{}/".format(db_master)
        template['maxIncomingConnections'] = self.test_config.gateway_settings.conn_in
        template['maxCouchbaseConnections'] = self.test_config.gateway_settings.conn_db
        template['CompressResponses'] = self.test_config.gateway_settings.compression

        with open('templates/gateway_config.json', 'w') as fh:
            fh.write(pretty_dict(template))
        self.remote_helper.start_sync_gateway()

    def install(self):
        """Full install flow: validate topology, then (re)install gw and gl."""
        num_gateways = len(self.cluster_spec.gateways)
        num_gateloads = len(self.cluster_spec.gateloads)
        if num_gateways != num_gateloads:
            logger.interrupt(
                'The cluster config file has different number of gateways({}) and gateloads({})'
                .format(num_gateways, num_gateloads)
            )

        self.kill_processes_gw()
        self.uninstall_package_gw()
        self.install_package_gw()
        self.start_sync_gateways()

        self.kill_processes_gl()
        self.uninstall_package_gl()
        self.install_package_gl()
def __init__(self, cluster_spec, options):
    """Store the cluster spec and CLI options; set up remote execution.

    :param cluster_spec: parsed cluster specification
    :param options: parsed command-line options (verbose flag is forwarded)
    """
    # Independent attribute assignments; order is not significant.
    self.cluster_spec = cluster_spec
    self.options = options
    self.remote = RemoteHelper(cluster_spec, options.verbose)
class CouchbaseInstaller(object):
    """Locates and installs a Couchbase Server build on all cluster nodes.

    Candidate packages are probed (HTTP HEAD) across several internal build
    mirrors; the first URL answering 200 wins.
    """

    CBFS = 'http://cbfs-ext.hq.couchbase.com/builds/'
    LATEST_BUILDS = 'http://latestbuilds.hq.couchbase.com/'
    SHERLOCK_BUILDS = ''   # filled per-instance once the build number is known
    WATSON_BUILDS = ''     # filled per-instance once the build number is known

    def __init__(self, cluster_spec, options):
        self.options = options
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        arch = self.remote.detect_arch()
        pkg = self.remote.detect_pkg()

        release = None
        build = None
        if options.version:
            release, build = options.version.split('-')
            # Shadow the class-level empty URLs with build-specific ones.
            self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(build)
            self.WATSON_BUILDS = 'http://172.23.120.24/builds/latestbuilds/couchbase-server/watson/{}/'.format(build)
            if options.toy:
                self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/toy-{}/{}/'.format(options.toy, build)

        self.build = Build(arch, pkg, options.cluster_edition, options.version,
                           release, build, options.toy, options.url)
        logger.info('Target build info: {}'.format(self.build))

    def get_expected_filenames(self):
        """Yield every historically-used package filename for this build."""
        patterns = ()  # Sentinel
        if self.build.toy:
            patterns = (
                'couchbase-server-community_toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_toy-{toy}-{version}-toy_{arch}.{pkg}',
                'couchbase-server-community_cent58-2.5.2-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent58-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_ubunt12-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent64-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent64-3.0.1-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent58-master-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent54-master-toy-{toy}-{arch}_{version}-toy.{pkg}',
                # For toy builds >= Sherlock
                'couchbase-server-{edition}-{version}-centos6.{arch}.{pkg}',
                'couchbase-server-{edition}_{version}-ubuntu12.04_amd64.{pkg}',
            )
        elif self.build.pkg == 'rpm':
            patterns = (
                'couchbase-server-{edition}_centos6_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}-{version}-centos6.{arch}.{pkg}',
                'couchbase-server-{edition}_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}-{version}.{arch}.{pkg}',
            )
        elif self.build.pkg == 'deb':
            patterns = (
                'couchbase-server-{edition}_ubuntu_1204_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}_{version}-ubuntu12.04_amd64.{pkg}',
                'couchbase-server-{edition}_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}_{version}_amd64.{pkg}',
            )
        elif self.build.pkg == 'exe':
            patterns = (
                'couchbase-server-{edition}_{arch}_{version}-rel.setup.{pkg}',
                'couchbase_server-{edition}-windows-amd64-{version}.{pkg}',
                'couchbase-server-{edition}_{version}-windows_amd64.{pkg}',
                'couchbase_server/{release}/{build}/couchbase_server-{edition}-windows-amd64-{version}.exe',
                'couchbase-server-{edition}_{version}-windows_amd64.{pkg}',
            )

        for pattern in patterns:
            yield pattern.format(**self.build._asdict())

    def find_package(self):
        """Probe each filename on each mirror; return the first (filename, url) hit."""
        for filename in self.get_expected_filenames():
            for base in (self.LATEST_BUILDS, self.SHERLOCK_BUILDS,
                         self.WATSON_BUILDS, self.CBFS):
                url = '{}{}'.format(base, filename)
                try:
                    status_code = requests.head(url).status_code
                except ConnectionError:
                    continue  # unreachable mirror; try the next one
                else:
                    if status_code == 200:
                        logger.info('Found "{}"'.format(url))
                        return filename, url
        logger.interrupt('Target build not found')

    def kill_processes(self):
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase(self.build.pkg)

    def clean_data(self):
        self.remote.clean_data()

    def install_package(self):
        """Install either the discovered package or an explicitly given URL."""
        if not self.options.url:
            filename, url = self.find_package()
        else:
            url = self.options.url
            logger.info("Using this URL to install instead of searching amongst"
                        " the known locations: {}".format(url))
            # obtain the filename after the last '/' of a url.
            filename = urlparse(url).path.split('/')[-1]
        self.remote.install_couchbase(self.build.pkg, url, filename,
                                      self.build.release)

    def install(self):
        """Full reinstall flow: kill, uninstall, wipe data, install."""
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
class ClusterManager(object):
    """Configures a freshly-installed Couchbase cluster via REST and SSH."""

    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
        self.monitor = Monitor(cluster_spec)
        self.memcached = MemcachedHelper(test_config)

        # Generator factories, called as self.clusters(), self.servers(), etc.
        self.clusters = cluster_spec.yield_clusters
        self.servers = cluster_spec.yield_servers
        self.masters = cluster_spec.yield_masters

        self.initial_nodes = test_config.cluster.initial_nodes
        self.mem_quota = test_config.cluster.mem_quota
        self.index_mem_quota = test_config.cluster.index_mem_quota
        self.group_number = test_config.cluster.group_number or 1
        self.roles = cluster_spec.roles

    def set_data_path(self):
        """Apply the optional (data, index) path pair to every server."""
        if self.cluster_spec.paths:
            data_path, index_path = self.cluster_spec.paths
            for server in self.servers():
                self.rest.set_data_path(server, data_path, index_path)

    def set_auth(self):
        for server in self.servers():
            self.rest.set_auth(server)

    def set_mem_quota(self):
        for server in self.servers():
            self.rest.set_mem_quota(server, self.mem_quota)

    def set_index_mem_quota(self):
        for server in self.servers():
            self.rest.set_index_mem_quota(server, self.index_mem_quota)

    def set_query_settings(self):
        """Push N1QL settings to every node with the 'n1ql' role."""
        settings = self.test_config.n1ql_settings.settings
        for _, servers in self.cluster_spec.yield_servers_by_role('n1ql'):
            for server in servers:
                self.rest.set_query_settings(server, settings)

    def set_index_settings(self):
        """Push GSI settings to 'index' nodes, then restart to apply them."""
        settings = self.test_config.secondaryindex_settings.settings
        for _, servers in self.cluster_spec.yield_servers_by_role('index'):
            for server in servers:
                self.rest.set_index_settings(server, settings)
        self.remote.restart()
        time.sleep(60)  # allow the restarted services to settle

    def set_services(self):
        for (_, servers), initial_nodes in zip(self.clusters(),
                                               self.initial_nodes):
            master = servers[0]
            self.rest.set_services(master, self.roles[master])

    def disable_moxi(self):
        if self.test_config.cluster.disable_moxi is not None:
            self.remote.disable_moxi()

    def create_server_groups(self):
        """Create 'Group 2'..'Group N' on each master ('Group 1' already exists)."""
        for master in self.masters():
            for i in range(1, self.group_number):
                name = 'Group {}'.format(i + 1)
                self.rest.create_server_group(master, name=name)

    def add_nodes(self):
        """Add the initial nodes to each cluster and rebalance."""
        for (_, servers), initial_nodes in zip(self.clusters(),
                                               self.initial_nodes):

            if initial_nodes < 2:  # Single-node cluster
                continue

            # Adding initial nodes
            master = servers[0]
            if self.group_number > 1:
                groups = self.rest.get_server_groups(master)
            else:
                groups = {}
            for i, host_port in enumerate(servers[1:initial_nodes],
                                          start=1):
                uri = groups.get(
                    server_group(servers[:initial_nodes],
                                 self.group_number, i))
                self.rest.add_node(master, host_port, self.roles[host_port],
                                   uri)

            # Rebalance
            master = servers[0]
            known_nodes = servers[:initial_nodes]
            ejected_nodes = []
            self.rest.rebalance(master, known_nodes, ejected_nodes)
            self.monitor.monitor_rebalance(master)

    def create_buckets(self, emptyBuckets=False):
        """Create either the data buckets or the empty buckets, splitting the RAM quota evenly."""
        ram_quota = self.mem_quota / (self.test_config.cluster.num_buckets +
                                      self.test_config.cluster.emptybuckets)
        replica_number = self.test_config.bucket.replica_number
        replica_index = self.test_config.bucket.replica_index
        eviction_policy = self.test_config.bucket.eviction_policy
        threads_number = self.test_config.bucket.threads_number
        proxyPort = self.test_config.bucket.proxyPort
        password = self.test_config.bucket.password
        buckets = self.test_config.emptybuckets if emptyBuckets else self.test_config.buckets

        for master in self.masters():
            for bucket_name in buckets:
                self.rest.create_bucket(host_port=master,
                                        name=bucket_name,
                                        ram_quota=ram_quota,
                                        replica_number=replica_number,
                                        replica_index=replica_index,
                                        eviction_policy=eviction_policy,
                                        threads_number=threads_number,
                                        password=password,
                                        proxyPort=proxyPort)

    def configure_auto_compaction(self):
        compaction_settings = self.test_config.compaction
        for master in self.masters():
            self.rest.configure_auto_compaction(master, compaction_settings)

    def configure_internal_settings(self):
        internal_settings = self.test_config.internal_settings
        for master in self.masters():
            for parameter, value in internal_settings.items():
                self.rest.set_internal_settings(master,
                                                {parameter: int(value)})

    def tweak_memory(self):
        """OS-level memory tuning: swap, caches, swappiness, THP."""
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def restart_with_alternative_num_vbuckets(self):
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        """Apply any non-default bucket options via diag/eval, then restart."""
        cmd = 'ns_bucket:update_bucket_props("{}", ' \
              '[{{extra_config_string, "{}={}"}}]).'

        for option in ('defragmenter_enabled',
                       'exp_pager_stime',
                       'ht_locks',
                       'max_num_shards',
                       'max_threads',
                       'warmup_min_memory_threshold',
                       'bfilter_enabled'):
            value = getattr(self.test_config.bucket, option)
            # -1 and None both mean "leave the server default alone"
            if value != -1 and value is not None:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.masters():
                    for bucket in self.test_config.buckets:
                        diag_eval = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, diag_eval)
                self.remote.restart()

    def tune_logging(self):
        self.remote.tune_log_rotation()
        self.remote.restart()

    def restart_with_alternative_num_cpus(self):
        num_cpus = self.test_config.cluster.num_cpus
        if num_cpus:
            self.remote.restart_with_alternative_num_cpus(num_cpus)

    def restart_with_tcmalloc_aggressive_decommit(self):
        if self.test_config.cluster.tcmalloc_aggressive_decommit:
            self.remote.restart_with_tcmalloc_aggressive_decommit()

    def restart_with_sfwi(self):
        if self.test_config.cluster.sfwi:
            self.remote.restart_with_sfwi()

    def enable_auto_failover(self):
        for master in self.masters():
            self.rest.enable_auto_failover(master)

    def wait_until_warmed_up(self):
        """Block until every (node, bucket) target finishes warmup."""
        target_iterator = TargetIterator(self.cluster_spec, self.test_config)
        for target in target_iterator:
            self.monitor.monitor_warmup(self.memcached, target.node,
                                        target.bucket)

    def wait_until_healthy(self):
        for master in self.cluster_spec.yield_masters():
            self.monitor.monitor_node_health(master)

    def change_watermarks(self):
        """Push mem_low/high watermark settings directly to memcached."""
        watermark_settings = self.test_config.watermark_settings
        for host_port, initial_nodes in zip(self.servers(),
                                            self.initial_nodes):
            host = host_port.split(':')[0]
            memcached_port = self.rest.get_memcached_port(host_port)
            for bucket in self.test_config.buckets:
                for key, val in watermark_settings.items():
                    val = self.memcached.calc_watermark(val, self.mem_quota)
                    self.memcached.set_flusher_param(host, memcached_port,
                                                     bucket, key, val)

    def start_cbq_engine(self):
        if self.test_config.cluster.run_cbq:
            self.remote.start_cbq()
class RemoteWorkerManager:
    """Manages remote Celery workers that generate the workload."""

    WORKER_HOME = '/tmp/perfrunner'

    PING_INTERVAL = 1  # seconds between Celery ping rounds

    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool):
        # NOTE: the constructor has side effects — it kills any leftover
        # workers, starts fresh ones, and blocks until they answer pings.
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
        self.workers = cycle(self.cluster_spec.workers)
        self.terminate()
        self.start()
        self.wait_until_workers_are_ready()

    @property
    def is_remote(self) -> bool:
        return True

    def next_worker(self) -> str:
        """Return the next worker host in round-robin order."""
        return next(self.workers)

    def reset_workers(self):
        """Restart the round-robin cycle from the first worker."""
        self.workers = cycle(self.cluster_spec.workers)

    def start(self):
        """Clone the repo on each worker host and launch a Celery worker."""
        logger.info('Initializing remote worker environment')
        self.remote.init_repo(self.WORKER_HOME)
        for worker in self.cluster_spec.workers:
            logger.info(
                'Starting remote Celery worker, host={}'.format(worker))
            perfrunner_home = os.path.join(self.WORKER_HOME, 'perfrunner')
            self.remote.start_celery_worker(worker, perfrunner_home)

    def wait_until_workers_are_ready(self):
        """Poll Celery pings until every worker responds."""
        workers = ['celery@{}'.format(worker)
                   for worker in self.cluster_spec.workers]
        while True:
            responses = celery.control.ping(workers)
            if len(responses) == len(workers):
                break
            time.sleep(self.PING_INTERVAL)
        logger.info('All remote Celery workers are ready')

    def run_tasks(self,
                  task: Callable,
                  task_settings: PhaseSettings,
                  target_iterator: TargetIterator,
                  timer: int = None):
        """Fan a task out to one worker per (target, instance) pair."""
        self.async_results = []
        self.reset_workers()
        for target in target_iterator:
            for instance in range(task_settings.worker_instances):
                worker = self.next_worker()
                logger.info('Running the task on {}'.format(worker))
                async_result = task.apply_async(
                    args=(task_settings, target, timer, instance),
                    queue=worker, expires=timer,
                )
                self.async_results.append(async_result)

    def run_sg_tasks(self,
                     task: Callable,
                     task_settings: PhaseSettings,
                     timer: int = None,
                     distrubute: bool = False,
                     phase: str = ""):
        """Run a Sync Gateway phase task on one or many client workers.

        NOTE(review): the `distrubute` parameter name is misspelled, but it
        is part of the public keyword interface and cannot be renamed without
        breaking callers. The 'sigle-instance' typo below is a runtime log
        string, preserved as-is.
        """
        self.async_results = []
        self.reset_workers()

        if distrubute:
            # Spread total_threads evenly across clients * instances.
            total_threads = int(task_settings.syncgateway_settings.threads)
            total_clients = int(task_settings.syncgateway_settings.clients)
            instances_per_client = int(
                task_settings.syncgateway_settings.instances_per_client)
            total_instances = total_clients * instances_per_client
            threads_per_instance = int(total_threads / total_instances) or 1
            worker_id = 0
            for instance in range(instances_per_client):
                for client in self.cluster_spec.workers[:total_clients]:
                    worker_id += 1
                    logger.info(
                        'Running the \'{}\' by worker #{} on client {}'.format(
                            phase, worker_id, client))
                    task_settings.syncgateway_settings.threads_per_instance = str(
                        threads_per_instance)
                    async_result = task.apply_async(
                        args=(task_settings, timer, worker_id, self.cluster_spec),
                        queue=client, expires=timer,
                    )
                    self.async_results.append(async_result)
        else:
            client = self.cluster_spec.workers[0]
            logger.info(
                'Running sigle-instance task \'{}\' on client {}'.format(
                    phase, client))
            async_result = task.apply_async(
                args=(task_settings, timer, 0, self.cluster_spec),
                queue=client, expires=timer,
            )
            self.async_results.append(async_result)

    def wait_for_workers(self):
        """Block until every dispatched async task completes."""
        logger.info('Waiting for all tasks to finish')
        for async_result in self.async_results:
            async_result.get()
        logger.info('All tasks are done')

    def download_celery_logs(self):
        if not os.path.exists('celery'):
            os.mkdir('celery')
        self.remote.get_celery_logs(self.WORKER_HOME)

    def terminate(self):
        logger.info('Terminating Celery workers')
        self.remote.terminate_client_processes()
class RestoreHelper:
    """Restores per-vbucket data files onto cluster nodes from a snapshot."""

    def __init__(self, cluster_spec, test_config):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.snapshot = self.test_config.restore_settings.snapshot
        self.remote = RemoteHelper(self.cluster_spec)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def fetch_maps(self):
        """Return {bucket: (vbmap, server_list)} for each configured bucket."""
        rest = RestHelper(self.cluster_spec)
        master_node = next(self.cluster_spec.masters)
        maps = {}
        for bucket in self.test_config.buckets:
            vbmap = rest.get_vbmap(master_node, bucket)
            server_list = rest.get_server_list(master_node, bucket)
            maps[bucket] = (vbmap, server_list)
        return maps

    def cp(self, server, cmd):
        """Run a copy command over SSH on `server` (fabric settings context)."""
        logger.info('Restoring files on {}'.format(server))
        with settings(host_string=server,
                      user=self.cluster_spec.ssh_credentials[0],
                      password=self.cluster_spec.ssh_credentials[1]):
            run(cmd)

    def restore(self):
        """Stop the server, copy vbucket files per node in parallel, restart."""
        maps = self.fetch_maps()

        self.remote.stop_server()

        threads = []
        for bucket, (vbmap, server_list) in maps.items():
            # Group vbucket ids by the server that owns a replica of them.
            files = defaultdict(list)
            for vb_idx, nodes in enumerate(vbmap):
                for node_idx in nodes:
                    files[server_list[node_idx]].append(vb_idx)

            for server, vbuckets in files.items():
                cmd = 'cp '
                for vbucket in vbuckets:
                    cmd += '{}/{}.couch.1 '.format(self.snapshot, vbucket)
                cmd += '/data/{}'.format(bucket)

                threads.append(Thread(target=self.cp, args=(server, cmd)))

        for t in threads:
            t.start()
            # Staggered start; presumably avoids fabric connection races
            # — TODO confirm.
            time.sleep(1)
        for t in threads:
            t.join()
        # Clear fabric's cached SSH connections after the parallel work.
        state.connections.clear()

        self.remote.drop_caches()
        self.remote.start_server()

    def warmup(self):
        """Wait until the restored cluster is warmed up and healthy."""
        cm = ClusterManager(self.cluster_spec, self.test_config)
        cm.wait_until_warmed_up()
        cm.wait_until_healthy()
class PerfTest(object):
    """Base class for performance tests: wires up helpers, runs workload
    phases, and validates cluster health on exit."""

    COLLECTORS = {}

    def __init__(self, cluster_spec, test_config, experiment=None):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.target_iterator = TargetIterator(self.cluster_spec,
                                              self.test_config)

        self.memcached = MemcachedHelper(cluster_spec)
        self.monitor = Monitor(cluster_spec)
        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec)

        if experiment:
            self.experiment = ExperimentHelper(experiment,
                                               cluster_spec, test_config)

        # FIX: was `cluster_spec.yield_masters().next()` — generators have no
        # .next() method on Python 3; the builtin next() works on both 2.6+
        # and 3.x and matches usage elsewhere in the codebase.
        self.master_node = next(cluster_spec.yield_masters())
        self.build = self.rest.get_version(self.master_node)

        self.cbagent = CbAgent(self)
        self.metric_helper = MetricHelper(self)
        self.reporter = Reporter(self)
        self.reports = {}
        self.snapshots = []
        self.master_events = []

        if self.test_config.test_case.use_workers:
            self.worker_manager = WorkerManager(cluster_spec, test_config)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Tear down workers, collect debug info, and verify cluster health."""
        if self.test_config.test_case.use_workers:
            self.worker_manager.terminate()
        if exc_type != exc.KeyboardInterrupt:
            self.debug()
        for master in self.cluster_spec.yield_masters():
            if not self.rest.is_balanced(master):
                logger.interrupt('Rebalance failed')

            num_failovers = self.rest.get_failover_counter(master)
            if hasattr(self, 'rebalance_settings'):
                # Deliberate failover tests are expected to trip the counter.
                if self.rebalance_settings.failover or \
                        self.rebalance_settings.graceful_failover:
                    continue
            if num_failovers:
                logger.interrupt(
                    'Failover happened {} time(s)'.format(num_failovers)
                )

    def compact_bucket(self):
        """Trigger compaction on every bucket, then wait for completion."""
        for master in self.cluster_spec.yield_masters():
            for bucket in self.test_config.buckets:
                self.rest.trigger_bucket_compaction(master, bucket)
        for master in self.cluster_spec.yield_masters():
            self.monitor.monitor_task(master, 'bucket_compaction')

    def wait_for_persistence(self):
        """Wait until disk queues drain and TAP replication catches up."""
        for master in self.cluster_spec.yield_masters():
            for bucket in self.test_config.buckets:
                self.monitor.monitor_disk_queue(master, bucket)
                self.monitor.monitor_tap_replication(master, bucket)

    def load(self):
        """Run the load phase synchronously."""
        load_settings = self.test_config.load_settings
        log_phase('load phase', load_settings)
        self.worker_manager.run_workload(load_settings, self.target_iterator)
        self.worker_manager.wait_for_workers()

    def hot_load(self):
        """Run the hot-load phase synchronously."""
        hot_load_settings = self.test_config.hot_load_settings
        log_phase('hot load phase', hot_load_settings)
        self.worker_manager.run_workload(hot_load_settings,
                                         self.target_iterator)
        self.worker_manager.wait_for_workers()

    def access(self):
        """Run the access phase synchronously."""
        access_settings = self.test_config.access_settings
        log_phase('access phase', access_settings)
        self.worker_manager.run_workload(access_settings,
                                         self.target_iterator)
        self.worker_manager.wait_for_workers()

    def access_bg(self):
        """Start the access phase in the background (timed, no wait)."""
        access_settings = self.test_config.access_settings
        log_phase('access in background', access_settings)
        self.worker_manager.run_workload(access_settings,
                                         self.target_iterator,
                                         timer=access_settings.time)

    def access_bg_with_ddocs(self):
        """Start a timed background access phase that also exercises ddocs."""
        access_settings = self.test_config.access_settings
        log_phase('access phase', access_settings)
        index_type = self.test_config.index_settings.index_type
        self.worker_manager.run_workload(access_settings,
                                         self.target_iterator,
                                         timer=access_settings.time,
                                         ddocs=self.ddocs,
                                         index_type=index_type)

    def timer(self):
        """Sleep for the configured access-phase duration."""
        access_settings = self.test_config.access_settings
        logger.info('Running phase for {} seconds'.format(access_settings.time))
        time.sleep(access_settings.time)

    def debug(self):
        """Collect cbcollect archives from all hosts and save web logs."""
        self.remote.collect_info()
        for hostname in self.cluster_spec.yield_hostnames():
            for fname in glob.glob('{}/*.zip'.format(hostname)):
                shutil.move(fname, '{}.zip'.format(hostname))
        self.reporter.save_web_logs()
def __init__(self, cluster_spec: ClusterSpec):
    """Initialize the base class and resolve the IP/port translation maps."""
    super().__init__(cluster_spec=cluster_spec)

    self.remote = RemoteHelper(cluster_spec)
    mapping = self.remote.get_ip_port_mapping()
    self.ip_table, self.port_translation = mapping
def __init__(self, cluster_spec, options):
    """Resolve operator/server/backup image tags and manifest paths.

    A version containing '-' (e.g. '2.2.0-123') is treated as an internal
    build and pulled from the GitLab registry; otherwise the public Docker
    Hub image is used.

    :param cluster_spec: parsed cluster specification
    :param options: CLI options carrying operator/couchbase/backup versions
    """
    self.options = options
    self.cluster_spec = cluster_spec

    self.operator_version = self.options.operator_version
    if "-" in self.operator_version:
        self.operator_release = self.operator_version.split("-")[0]
        self.operator_tag = 'registry.gitlab.com/cb-vanilla/operator:{}'\
            .format(self.operator_version)
        self.admission_controller_release = self.operator_version.split("-")[0]
        self.admission_controller_tag = \
            'registry.gitlab.com/cb-vanilla/admission-controller:{}' \
            .format(self.operator_version)
    else:
        self.operator_release = self.operator_version
        self.operator_tag = 'couchbase/operator:{}'\
            .format(self.operator_version)
        self.admission_controller_release = self.operator_version
        self.admission_controller_tag = 'couchbase/admission-controller:{}' \
            .format(self.operator_version)

    self.couchbase_version = self.options.couchbase_version
    if "-" in self.couchbase_version:
        self.couchbase_release = self.couchbase_version.split("-")[0]
        self.couchbase_tag = 'registry.gitlab.com/cb-vanilla/server:{}'\
            .format(self.couchbase_version)
    else:
        self.couchbase_release = self.couchbase_version
        self.couchbase_tag = 'couchbase/server:{}'\
            .format(self.couchbase_version)

    self.operator_backup_version = self.options.operator_backup_version
    if self.operator_backup_version:
        if "-" in self.operator_backup_version:
            self.operator_backup_release = self.operator_backup_version.split("-")[0]
            self.operator_backup_tag = 'registry.gitlab.com/cb-vanilla/operator-backup:{}'\
                .format(self.operator_backup_version)
        else:
            self.operator_backup_release = self.operator_backup_version
            # FIX: was 'couchbase/operator-backup/{}' — a '/' before the
            # version makes an invalid Docker image reference; image tags
            # use 'name:tag' as every other branch in this method does.
            self.operator_backup_tag = 'couchbase/operator-backup:{}'\
                .format(self.operator_backup_version)
    else:
        self.operator_backup_tag = 'registry.gitlab.com/cb-vanilla/operator-backup:latest'

    self.node_count = len(self.cluster_spec.infrastructure_clusters['couchbase1'].split())

    self.remote = RemoteHelper(cluster_spec)
    self.docker_config_path = os.path.expanduser("~") + "/.docker/config.json"

    # Manifest paths are versioned by the operator's major.minor release.
    self.operator_base_path = "cloud/operator/{}/{}"\
        .format(self.operator_release.split(".")[0],
                self.operator_release.split(".")[1])
    self.certificate_authority_path = "{}/ca.crt"\
        .format(self.operator_base_path)
    self.crd_path = "{}/crd.yaml"\
        .format(self.operator_base_path)
    self.config_path = "{}/config.yaml"\
        .format(self.operator_base_path)
    self.config_template_path = "{}/config_template.yaml"\
        .format(self.operator_base_path)
    self.auth_path = "{}/auth_secret.yaml"\
        .format(self.operator_base_path)
    self.cb_cluster_path = "{}/couchbase-cluster.yaml"\
        .format(self.operator_base_path)
    self.template_cb_cluster_path = "{}/couchbase-cluster_template.yaml"\
        .format(self.operator_base_path)

    self.worker_base_path = "cloud/worker"
    self.worker_path = "{}/worker.yaml"\
        .format(self.worker_base_path)

    self.rmq_base_path = "cloud/broker/rabbitmq/0.48"
    self.rmq_operator_path = "{}/cluster-operator.yaml"\
        .format(self.rmq_base_path)
    self.rmq_cluster_path = "{}/rabbitmq.yaml"\
        .format(self.rmq_base_path)
class CouchbaseInstaller(object):
    """Install a Couchbase Server build on all cluster nodes over SSH.

    Candidate package filenames are generated from the build metadata and
    probed (HTTP HEAD) against a fixed list of build servers until one
    exists.
    """

    # Base URLs probed in find_package(); order matters (see find_package).
    CBFS = 'http://cbfs-ext.hq.couchbase.com/builds/'
    LATEST_BUILDS = 'http://latestbuilds.hq.couchbase.com/'
    SHERLOCK_BUILDS = ''  # class-level placeholder, overwritten per-instance in __init__

    def __init__(self, cluster_spec, options):
        # Remote shell to the target nodes; used both for platform detection
        # and for the actual install commands.
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        arch = self.remote.detect_arch()
        pkg = self.remote.detect_pkg()

        # options.version is expected as 'release-build', e.g. '4.0.0-1234'.
        release, build = options.version.split('-')

        # Sherlock builds live in a per-build-number subdirectory.
        self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(build)

        self.build = Build(arch, pkg, options.version, release, build,
                           options.toy)
        logger.info('Target build info: {}'.format(self.build))

    def get_expected_filenames(self):
        """Yield candidate package filenames for this build.

        The historical filename schemes changed between releases, so several
        patterns are tried per package type; each is expanded with the Build
        fields (arch, pkg, version, toy, ...).
        """
        patterns = ()  # Sentinel
        if self.build.toy:
            # Developer ("toy") builds; naming varied across base OS images.
            patterns = (
                'couchbase-server-community_toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_toy-{toy}-{version}-toy_{arch}.{pkg}',
                'couchbase-server-community_cent58-2.5.2-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent58-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent64-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent64-3.0.1-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent58-master-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent54-master-toy-{toy}-{arch}_{version}-toy.{pkg}',
            )
        elif self.build.pkg == 'rpm':
            patterns = (
                'couchbase-server-enterprise_centos6_{arch}_{version}-rel.{pkg}',
                'couchbase-server-enterprise-{version}-centos6.{arch}.{pkg}',
                'couchbase-server-enterprise_{arch}_{version}-rel.{pkg}',
            )
        elif self.build.pkg == 'deb':
            patterns = (
                'couchbase-server-enterprise_ubuntu_1204_{arch}_{version}-rel.{pkg}',
                'couchbase-server-enterprise_{version}-ubuntu12.04_amd64.{pkg}',
                'couchbase-server-enterprise_{arch}_{version}-rel.{pkg}',
            )
        elif self.build.pkg == 'exe':
            patterns = (
                'couchbase-server-enterprise_{arch}_{version}-rel.setup.{pkg}',
                'couchbase_server-enterprise-windows-amd64-{version}.{pkg}',
                'couchbase_server/{release}/{build}/couchbase_server-enterprise-windows-amd64-{version}.exe',
            )
        for pattern in patterns:
            yield pattern.format(**self.build._asdict())

    def find_package(self):
        """Return (filename, url) of the first existing package.

        Each candidate filename is probed against each base URL with an HTTP
        HEAD request; unreachable hosts are skipped. Aborts the run via
        logger.interrupt when nothing matches.
        """
        for filename in self.get_expected_filenames():
            for base in (self.LATEST_BUILDS, self.SHERLOCK_BUILDS, self.CBFS):
                url = '{}{}'.format(base, filename)
                try:
                    status_code = requests.head(url).status_code
                except ConnectionError:
                    continue
                else:
                    if status_code == 200:
                        logger.info('Found "{}"'.format(url))
                        return filename, url
        logger.interrupt('Target build not found')

    def kill_processes(self):
        # Stop any running Couchbase processes before uninstalling.
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase(self.build.pkg)

    def clean_data(self):
        # Wipe leftover data files from previous installations.
        self.remote.clean_data()

    def install_package(self):
        filename, url = self.find_package()
        self.remote.install_couchbase(self.build.pkg, url, filename,
                                      self.build.release)

    def install(self):
        """Full reinstall sequence: kill, uninstall, clean, install."""
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
def __init__(self, cluster_spec, test_config, options): self.remote_helper = RemoteHelper(cluster_spec) self.cluster_spec = cluster_spec self.test_config = test_config self.pkg = self.remote_helper.detect_pkg() self.version = options.version
class ClusterManager:
    """Configure and manage a Couchbase cluster for a performance test.

    Every operation supports two backends: classic bare-metal/VM clusters
    driven via REST + SSH, and "dynamic infrastructure" (Kubernetes via the
    Couchbase Autonomous Operator) driven by patching the cluster custom
    resource through RemoteHelper.
    """

    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool = False):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        # True when running on Kubernetes (operator-managed) infrastructure.
        self.dynamic_infra = self.cluster_spec.dynamic_infrastructure
        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.monitor = Monitor(cluster_spec, test_config, verbose)
        self.memcached = MemcachedHelper(test_config)
        self.master_node = next(self.cluster_spec.masters)
        self.initial_nodes = test_config.cluster.initial_nodes
        # Full build string, e.g. '7.0.0-1234'.
        self.build = self.rest.get_version(self.master_node)

    def is_compatible(self, min_release: str) -> bool:
        # NOTE(review): returns on the first iteration, so only the first
        # master's version is ever checked; also compares version strings
        # lexically (e.g. '10.0.0' < '4.0.0') — confirm intent before reuse.
        for master in self.cluster_spec.masters:
            version = self.rest.get_version(master)
            return version >= min_release

    def set_data_path(self):
        """Point each server's KV data directory at the spec's data path."""
        if self.dynamic_infra:
            return
        for server in self.cluster_spec.servers:
            self.remote.change_owner(server, self.cluster_spec.data_path)
            self.rest.set_data_path(server, self.cluster_spec.data_path)

    def set_index_path(self):
        """Point each server's index directory at the spec's index path."""
        if self.dynamic_infra:
            return
        for server in self.cluster_spec.servers:
            self.remote.change_owner(server, self.cluster_spec.index_path)
            self.rest.set_index_path(server, self.cluster_spec.index_path)

    def set_analytics_path(self):
        """Configure analytics I/O device paths on all 'cbas' nodes."""
        if self.dynamic_infra:
            return
        paths = []
        # Expand each analytics path into num_io_devices sub-directories.
        for path in self.cluster_spec.analytics_paths:
            for i in range(self.test_config.analytics_settings.num_io_devices):
                io_device = '{}/dev{}'.format(path, i)
                paths.append(io_device)
        for server in self.cluster_spec.servers_by_role('cbas'):
            for path in self.cluster_spec.analytics_paths:
                self.remote.change_owner(server, path)
            self.rest.set_analytics_paths(server, paths)

    def rename(self):
        """Assign node names via the REST API (no-op on Kubernetes)."""
        if self.dynamic_infra:
            return
        else:
            for server in self.cluster_spec.servers:
                self.rest.rename(server)

    def set_auth(self):
        """Set administrator credentials on every node."""
        if self.dynamic_infra:
            return
        else:
            for server in self.cluster_spec.servers:
                self.rest.set_auth(server)

    def set_mem_quotas(self):
        """Apply per-service memory quotas (CR patch or REST per master)."""
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            cluster['spec']['cluster']['dataServiceMemoryQuota'] = \
                '{}Mi'.format(self.test_config.cluster.mem_quota)
            cluster['spec']['cluster']['indexServiceMemoryQuota'] = \
                '{}Mi'.format(self.test_config.cluster.index_mem_quota)
            if self.test_config.cluster.fts_index_mem_quota:
                cluster['spec']['cluster']['searchServiceMemoryQuota'] = \
                    '{}Mi'.format(self.test_config.cluster.fts_index_mem_quota)
            if self.test_config.cluster.analytics_mem_quota:
                cluster['spec']['cluster']['analyticsServiceMemoryQuota'] = \
                    '{}Mi'.format(self.test_config.cluster.analytics_mem_quota)
            if self.test_config.cluster.eventing_mem_quota:
                cluster['spec']['cluster']['eventingServiceMemoryQuota'] = \
                    '{}Mi'.format(self.test_config.cluster.eventing_mem_quota)
            self.remote.update_cluster_config(cluster)
        else:
            for master in self.cluster_spec.masters:
                self.rest.set_mem_quota(master,
                                        self.test_config.cluster.mem_quota)
                self.rest.set_index_mem_quota(
                    master, self.test_config.cluster.index_mem_quota)
                if self.test_config.cluster.fts_index_mem_quota:
                    self.rest.set_fts_index_mem_quota(
                        master, self.test_config.cluster.fts_index_mem_quota)
                if self.test_config.cluster.analytics_mem_quota:
                    self.rest.set_analytics_mem_quota(
                        master, self.test_config.cluster.analytics_mem_quota)
                if self.test_config.cluster.eventing_mem_quota:
                    self.rest.set_eventing_mem_quota(
                        master, self.test_config.cluster.eventing_mem_quota)

    def set_query_settings(self):
        """Push cbq-engine settings to the first query node and log them."""
        logger.info('Setting query settings')
        if self.dynamic_infra:
            return
        query_nodes = self.cluster_spec.servers_by_role('n1ql')
        if query_nodes:
            settings = self.test_config.n1ql_settings.cbq_settings
            if settings:
                self.rest.set_query_settings(query_nodes[0], settings)
            settings = self.rest.get_query_settings(query_nodes[0])
            settings = pretty_dict(settings)
            logger.info('Query settings: {}'.format(settings))

    def set_index_settings(self):
        """Push GSI settings to the first index node (or CR storage mode)."""
        logger.info('Setting index settings')
        index_nodes = self.cluster_spec.servers_by_role('index')
        if index_nodes:
            settings = self.test_config.gsi_settings.settings
            if settings:
                if self.dynamic_infra:
                    cluster = self.remote.get_cluster()
                    cluster['spec']['cluster']['indexStorageSetting'] = \
                        settings['indexer.settings.storage_mode']
                    self.remote.update_cluster_config(cluster, timeout=300,
                                                      reboot=True)
                    logger.info('Index settings: {}'.format(settings))
                else:
                    self.rest.set_index_settings(index_nodes[0], settings)
                    settings = self.rest.get_index_settings(index_nodes[0])
                    settings = pretty_dict(settings)
                    logger.info('Index settings: {}'.format(settings))

    def set_services(self):
        """Assign service topology: CR server groups or REST per master."""
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            # Count servers per unique role combination; the operator uses
            # 'data'/'query' where perfrunner specs say 'kv'/'n1ql'.
            server_types = dict()
            server_roles = self.cluster_spec.roles
            for server, role in server_roles.items():
                role = role\
                    .replace('kv', 'data')\
                    .replace('n1ql', 'query')
                server_type_count = server_types.get(role, 0)
                server_types[role] = server_type_count + 1
            istio = 'false'
            if self.cluster_spec.istio_enabled(cluster_name='k8s_cluster_1'):
                istio = 'true'
            cluster_servers = []
            operator_version = self.remote.get_operator_version()
            operator_major = int(operator_version.split(".")[0])
            operator_minor = int(operator_version.split(".")[1])
            for server_role, server_role_count in server_types.items():
                # Pods are pinned to nodes labelled per enabled service
                # (labels use the perfrunner 'kv'/'n1ql' names again).
                node_selector = {
                    '{}_enabled'.format(
                        service.replace('data', 'kv').replace('query', 'n1ql')): 'true'
                    for service in server_role.split(",")
                }
                node_selector['NodeRoles'] = 'couchbase1'
                spec = {
                    'imagePullSecrets': [{
                        'name': 'regcred'
                    }],
                    'nodeSelector': node_selector,
                }
                # Operator <= 2.1 requires an explicit (empty) containers list.
                if (operator_major, operator_minor) <= (2, 1):
                    spec['containers'] = []
                pod_def =\
                    {
                        'spec': spec,
                        'metadata': {
                            'annotations': {'sidecar.istio.io/inject': istio}
                        }
                    }
                server_def = \
                    {
                        'name': server_role.replace(",", "-"),
                        'services': server_role.split(","),
                        'pod': pod_def,
                        'size': server_role_count,
                        'volumeMounts': {'default': 'couchbase_kv'}
                    }
                cluster_servers.append(server_def)
            cluster['spec']['servers'] = cluster_servers
            self.remote.update_cluster_config(cluster, timeout=300,
                                              reboot=True)
        else:
            # Per-node services require Couchbase Server 4.0+.
            if not self.is_compatible(min_release='4.0.0'):
                return
            for master in self.cluster_spec.masters:
                roles = self.cluster_spec.roles[master]
                self.rest.set_services(master, roles)

    def add_nodes(self):
        """Join the first initial_nodes servers of each cluster to its master."""
        if self.dynamic_infra:
            return
        for (_, servers), initial_nodes \
                in zip(self.cluster_spec.clusters, self.initial_nodes):
            if initial_nodes < 2:  # Single-node cluster
                continue
            master = servers[0]
            for node in servers[1:initial_nodes]:
                roles = self.cluster_spec.roles[node]
                self.rest.add_node(master, node, roles)

    def rebalance(self):
        """Rebalance each cluster and block until it is healthy."""
        if self.dynamic_infra:
            return
        for (_, servers), initial_nodes \
                in zip(self.cluster_spec.clusters, self.initial_nodes):
            master = servers[0]
            known_nodes = servers[:initial_nodes]
            ejected_nodes = []
            self.rest.rebalance(master, known_nodes, ejected_nodes)
            self.monitor.monitor_rebalance(master)
        self.wait_until_healthy()

    def increase_bucket_limit(self, num_buckets: int):
        if self.dynamic_infra:
            return
        for master in self.cluster_spec.masters:
            self.rest.increase_bucket_limit(master, num_buckets)

    def flush_buckets(self):
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.flush_bucket(host=master, bucket=bucket_name)

    def delete_buckets(self):
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.delete_bucket(host=master, name=bucket_name)

    def create_buckets(self):
        """Create the test buckets, splitting the KV quota evenly."""
        mem_quota = self.test_config.cluster.mem_quota
        # Couchbase caps clusters at ~10 buckets by default; raise the limit
        # (with headroom) when the test needs more than 7.
        if self.test_config.cluster.num_buckets > 7:
            self.increase_bucket_limit(self.test_config.cluster.num_buckets + 3)
        # Reserve quota for eventing buckets, which are created separately.
        if self.test_config.cluster.eventing_metadata_bucket_mem_quota:
            mem_quota -= (
                self.test_config.cluster.eventing_metadata_bucket_mem_quota +
                self.test_config.cluster.eventing_bucket_mem_quota)
        per_bucket_quota = mem_quota // self.test_config.cluster.num_buckets
        if self.dynamic_infra:
            self.remote.delete_all_buckets()
            for bucket_name in self.test_config.buckets:
                self.remote.create_bucket(bucket_name, per_bucket_quota,
                                          self.test_config.bucket)
        else:
            # Magma storage was developer-preview-only at this point.
            if self.test_config.bucket.backend_storage == 'magma':
                self.enable_developer_preview()
            for master in self.cluster_spec.masters:
                for bucket_name in self.test_config.buckets:
                    self.rest.create_bucket(
                        host=master,
                        name=bucket_name,
                        ram_quota=per_bucket_quota,
                        password=self.test_config.bucket.password,
                        replica_number=self.test_config.bucket.replica_number,
                        replica_index=self.test_config.bucket.replica_index,
                        eviction_policy=self.test_config.bucket.eviction_policy,
                        bucket_type=self.test_config.bucket.bucket_type,
                        backend_storage=self.test_config.bucket.backend_storage,
                        conflict_resolution_type=self.test_config.bucket.conflict_resolution_type,
                        compression_mode=self.test_config.bucket.compression_mode,
                    )

    def create_collections(self):
        """Create scopes/collections from the collection map.

        Either via the bulk "set collection manifest" API, or one-by-one,
        in which case the _default collection is deleted unless the map
        explicitly keeps it.
        """
        if self.dynamic_infra:
            return
        collection_map = self.test_config.collection.collection_map
        for master in self.cluster_spec.masters:
            if collection_map is not None:
                if self.test_config.collection.use_bulk_api:
                    for bucket in collection_map.keys():
                        create_scopes = []
                        for scope in collection_map[bucket]:
                            scope_collections = []
                            for collection in collection_map[bucket][scope]:
                                scope_collections.append({"name": collection})
                            create_scopes.append({
                                "name": scope,
                                "collections": scope_collections
                            })
                        self.rest.set_collection_map(master, bucket,
                                                     {"scopes": create_scopes})
                else:
                    # Drop _default/_default unless the map asks to keep it.
                    for bucket in collection_map.keys():
                        delete_default = True
                        for scope in collection_map[bucket]:
                            if scope == '_default':
                                for collection in collection_map[bucket][scope]:
                                    if collection == "_default":
                                        delete_default = False
                        if delete_default:
                            self.rest.delete_collection(
                                master, bucket, '_default', '_default')
                    for bucket in collection_map.keys():
                        for scope in collection_map[bucket]:
                            if scope != '_default':
                                self.rest.create_scope(master, bucket, scope)
                            for collection in collection_map[bucket][scope]:
                                if collection != '_default':
                                    self.rest.create_collection(
                                        master, bucket, scope, collection)

    def create_eventing_buckets(self):
        """Create eventing source buckets from their reserved quota."""
        if not self.test_config.cluster.eventing_bucket_mem_quota:
            return
        if self.dynamic_infra:
            return
        per_bucket_quota = \
            self.test_config.cluster.eventing_bucket_mem_quota \
            // self.test_config.cluster.eventing_buckets
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.eventing_buckets:
                self.rest.create_bucket(
                    host=master,
                    name=bucket_name,
                    ram_quota=per_bucket_quota,
                    password=self.test_config.bucket.password,
                    replica_number=self.test_config.bucket.replica_number,
                    replica_index=self.test_config.bucket.replica_index,
                    eviction_policy=self.test_config.bucket.eviction_policy,
                    bucket_type=self.test_config.bucket.bucket_type,
                    conflict_resolution_type=self.test_config.bucket.conflict_resolution_type,
                )

    def create_eventing_metadata_bucket(self):
        """Create the dedicated eventing metadata bucket."""
        if not self.test_config.cluster.eventing_metadata_bucket_mem_quota:
            return
        if self.dynamic_infra:
            return
        for master in self.cluster_spec.masters:
            self.rest.create_bucket(
                host=master,
                name=self.test_config.cluster.EVENTING_METADATA_BUCKET_NAME,
                ram_quota=self.test_config.cluster.eventing_metadata_bucket_mem_quota,
                password=self.test_config.bucket.password,
                replica_number=self.test_config.bucket.replica_number,
                replica_index=self.test_config.bucket.replica_index,
                eviction_policy=self.test_config.bucket.EVICTION_POLICY,
                bucket_type=self.test_config.bucket.BUCKET_TYPE,
            )

    def configure_auto_compaction(self):
        """Apply fragmentation-threshold compaction settings."""
        compaction_settings = self.test_config.compaction
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            db = int(compaction_settings.db_percentage)
            view = int(compaction_settings.view_percentage)
            # NOTE(review): bool(str(...).lower()) is True for ANY non-empty
            # value, including 'false' — confirm against intended semantics.
            para = bool(str(compaction_settings.parallel).lower())
            # NOTE(review): if 'autoCompaction' is missing, the default dict
            # returned by .get() is mutated but never assigned back into
            # cluster['spec']['cluster'] — verify the patch takes effect.
            auto_compaction = cluster['spec']['cluster']\
                .get('autoCompaction',
                     {'databaseFragmentationThreshold': {'percent': 30},
                      'viewFragmentationThreshold': {'percent': 30},
                      'parallelCompaction': False})
            db_percent = auto_compaction.get('databaseFragmentationThreshold',
                                             {'percent': 30})
            db_percent['percent'] = db
            auto_compaction['databaseFragmentationThreshold'] = db_percent
            views_percent = auto_compaction.get('viewFragmentationThreshold',
                                                {'percent': 30})
            views_percent['percent'] = view
            auto_compaction['viewFragmentationThreshold'] = views_percent
            auto_compaction['parallelCompaction'] = para
            self.remote.update_cluster_config(cluster)
        else:
            for master in self.cluster_spec.masters:
                self.rest.configure_auto_compaction(master,
                                                    compaction_settings)
                settings = self.rest.get_auto_compaction_settings(master)
                logger.info('Auto-compaction settings: {}'.format(
                    pretty_dict(settings)))

    def configure_internal_settings(self):
        """Apply ns_server internal settings (REST only)."""
        internal_settings = self.test_config.internal_settings
        for master in self.cluster_spec.masters:
            for parameter, value in internal_settings.items():
                if self.dynamic_infra:
                    raise Exception(
                        'not supported for dynamic infrastructure yet')
                else:
                    self.rest.set_internal_settings(
                        master, {parameter: maybe_atoi(value)})

    def configure_xdcr_settings(self):
        """Apply cluster-wide XDCR settings (REST only)."""
        xdcr_cluster_settings = self.test_config.xdcr_cluster_settings
        if self.dynamic_infra:
            return
        for master in self.cluster_spec.masters:
            for parameter, value in xdcr_cluster_settings.items():
                self.rest.set_xdcr_cluster_settings(
                    master, {parameter: maybe_atoi(value)})

    def tweak_memory(self):
        """Normalize OS memory state: swap, caches, swappiness, THP."""
        if self.dynamic_infra:
            return
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def enable_n2n_encryption(self):
        """Enable node-to-node encryption if the test requires it."""
        if self.dynamic_infra:
            return
        if self.test_config.cluster.enable_n2n_encryption:
            for master in self.cluster_spec.masters:
                self.remote.enable_n2n_encryption(
                    master, self.test_config.cluster.enable_n2n_encryption)

    def restart_with_alternative_num_vbuckets(self):
        """Restart the cluster with a non-default vBucket count."""
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            if self.dynamic_infra:
                raise Exception('not supported for dynamic infrastructure yet')
            else:
                self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        """Apply custom buckets settings.

        Tune bucket settings (e.g., max_num_shards or max_num_auxio) using
        "/diag/eval" and restart the entire cluster.
        """
        if self.dynamic_infra:
            return
        if self.test_config.bucket_extras:
            self.remote.enable_nonlocal_diag_eval()
        cmd = 'ns_bucket:update_bucket_props("{}", ' \
              '[{{extra_config_string, "{}={}"}}]).'
        for option, value in self.test_config.bucket_extras.items():
            if re.search("^num_.*_threads$", option):
                # Thread-count settings have a dedicated REST endpoint.
                self.rest.set_num_threads(self.master_node, option, value)
            else:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.cluster_spec.masters:
                    for bucket in self.test_config.buckets:
                        diag_eval = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, diag_eval)
        if self.test_config.bucket_extras:
            # Disable auto-failover around the restart so no node gets
            # failed over while the cluster is bouncing.
            self.disable_auto_failover()
            self.remote.restart()
            self.wait_until_healthy()
            self.enable_auto_failover()

    def tune_logging(self):
        if self.dynamic_infra:
            return
        self.remote.tune_log_rotation()
        self.remote.restart()

    def enable_auto_failover(self):
        """Enable auto-failover per test config (CR patch or REST)."""
        enabled = self.test_config.bucket.autofailover_enabled
        failover_min = self.test_config.bucket.failover_min
        failover_max = self.test_config.bucket.failover_max
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            cluster['spec']['cluster']['autoFailoverMaxCount'] = 1
            cluster['spec']['cluster']['autoFailoverServerGroup'] = bool(
                enabled)
            cluster['spec']['cluster']['autoFailoverOnDataDiskIssues'] = bool(
                enabled)
            cluster['spec']['cluster']['autoFailoverOnDataDiskIssuesTimePeriod'] = \
                '{}s'.format(10)
            cluster['spec']['cluster']['autoFailoverTimeout'] = \
                '{}s'.format(failover_max)
            self.remote.update_cluster_config(cluster)
        else:
            for master in self.cluster_spec.masters:
                self.rest.set_auto_failover(master, enabled, failover_min,
                                            failover_max)

    def disable_auto_failover(self):
        """Disable auto-failover (same shape as enable_auto_failover).

        NOTE(review): bool('false') is True, so the CR branch sets the
        server-group/disk-issue flags to True while "disabling" — confirm.
        """
        enabled = 'false'
        failover_min = self.test_config.bucket.failover_min
        failover_max = self.test_config.bucket.failover_max
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            cluster['spec']['cluster']['autoFailoverMaxCount'] = 1
            cluster['spec']['cluster']['autoFailoverServerGroup'] = bool(
                enabled)
            cluster['spec']['cluster']['autoFailoverOnDataDiskIssues'] = bool(
                enabled)
            cluster['spec']['cluster']['autoFailoverOnDataDiskIssuesTimePeriod'] = \
                '{}s'.format(10)
            cluster['spec']['cluster']['autoFailoverTimeout'] = \
                '{}s'.format(failover_max)
            self.remote.update_cluster_config(cluster)
        else:
            for master in self.cluster_spec.masters:
                self.rest.set_auto_failover(master, enabled, failover_min,
                                            failover_max)

    def wait_until_warmed_up(self):
        """Block until all buckets finish warmup (not applicable to
        ephemeral/memcached buckets, which have no warmup phase)."""
        if self.test_config.bucket.bucket_type in ('ephemeral', 'memcached'):
            return
        if self.dynamic_infra:
            self.remote.wait_for_cluster_ready()
        else:
            for master in self.cluster_spec.masters:
                for bucket in self.test_config.buckets:
                    self.monitor.monitor_warmup(self.memcached, master,
                                                bucket)

    def wait_until_healthy(self):
        """Block until every node (and analytics node) reports healthy."""
        if self.dynamic_infra:
            self.remote.wait_for_cluster_ready()
        else:
            for master in self.cluster_spec.masters:
                self.monitor.monitor_node_health(master)
                for analytics_node in self.rest.get_active_nodes_by_role(
                        master, 'cbas'):
                    self.monitor.monitor_analytics_node_active(analytics_node)

    def gen_disabled_audit_events(self, master: str) -> List[str]:
        """Return currently-disabled audit events minus the extra ones the
        test wants enabled."""
        curr_settings = self.rest.get_audit_settings(master)
        curr_disabled = {str(event) for event in curr_settings['disabled']}
        disabled = curr_disabled - self.test_config.audit_settings.extra_events
        return list(disabled)

    def enable_audit(self):
        """Enable auditing (EE, 4.0+ only) per the audit settings."""
        if self.dynamic_infra:
            return
        if not self.is_compatible(min_release='4.0.0') or \
                self.rest.is_community(self.master_node):
            return
        if not self.test_config.audit_settings.enabled:
            return
        for master in self.cluster_spec.masters:
            disabled = []
            if self.test_config.audit_settings.extra_events:
                disabled = self.gen_disabled_audit_events(master)
            self.rest.enable_audit(master, disabled)

    def generate_ce_roles(self) -> List[str]:
        # Community Edition has no fine-grained RBAC.
        return ['admin']

    def generate_ee_roles(self) -> List[str]:
        """Return the per-bucket role templates supported by this server."""
        existing_roles = {
            r['role']
            for r in self.rest.get_rbac_roles(self.master_node)
        }
        roles = []
        for role in (
                'bucket_admin',
                'data_dcp_reader',
                'data_monitoring',
                'data_reader_writer',
                'data_reader',
                'data_writer',
                'fts_admin',
                'fts_searcher',
                'query_delete',
                'query_insert',
                'query_select',
                'query_update',
                'views_admin',
        ):
            if role in existing_roles:
                # '[{bucket}]' is filled in later per bucket.
                roles.append(role + '[{bucket}]')
        return roles

    def delete_rbac_users(self):
        if not self.is_compatible(min_release='5.0'):
            return
        for master in self.cluster_spec.masters:
            for bucket in self.test_config.buckets:
                self.rest.delete_rbac_user(host=master, bucket=bucket)

    def add_rbac_users(self):
        """Create admin and per-bucket RBAC users (YAML files on k8s)."""
        if self.dynamic_infra:
            self.remote.create_from_file(
                "cloud/operator/2/1/user-password-secret.yaml")
            # self.remote.create_from_file("cloud/operator/2/1/admin-user.yaml")
            self.remote.create_from_file("cloud/operator/2/1/bucket-user.yaml")
            self.remote.create_from_file(
                "cloud/operator/2/1/rbac-admin-group.yaml")
            self.remote.create_from_file(
                "cloud/operator/2/1/rbac-admin-role-binding.yaml")
        else:
            if not self.rest.supports_rbac(self.master_node):
                logger.info('RBAC not supported - skipping adding RBAC users')
                return
            if self.rest.is_community(self.master_node):
                roles = self.generate_ce_roles()
            else:
                roles = self.generate_ee_roles()
            for master in self.cluster_spec.masters:
                admin_user, admin_password = self.cluster_spec.rest_credentials
                self.rest.add_rbac_user(
                    host=master,
                    user=admin_user,
                    password=admin_password,
                    roles=['admin'],
                )
                buckets = self.test_config.buckets + self.test_config.eventing_buckets
                for bucket in buckets:
                    bucket_roles = [
                        role.format(bucket=bucket) for role in roles
                    ]
                    bucket_roles.append("admin")
                    self.rest.add_rbac_user(
                        host=master,
                        user=bucket,  # Backward compatibility
                        password=self.test_config.bucket.password,
                        roles=bucket_roles,
                    )

    def add_extra_rbac_users(self, num_users):
        """Create num_users additional per-bucket RBAC users."""
        if not self.rest.supports_rbac(self.master_node):
            logger.info('RBAC not supported - skipping adding RBAC users')
            return
        if self.rest.is_community(self.master_node):
            roles = self.generate_ce_roles()
        else:
            roles = self.generate_ee_roles()
        for master in self.cluster_spec.masters:
            admin_user, admin_password = self.cluster_spec.rest_credentials
            self.rest.add_rbac_user(
                host=master,
                user=admin_user,
                password=admin_password,
                roles=['admin'],
            )
            for bucket in self.test_config.buckets:
                bucket_roles = [role.format(bucket=bucket) for role in roles]
                bucket_roles.append("admin")
                for i in range(1, num_users + 1):
                    # NOTE(review): this literal has no format placeholders,
                    # so every user gets the same name '******' — looks like a
                    # redacted template (e.g. 'user{user_number}'); confirm
                    # against upstream before relying on it.
                    user = '******'.format(user_number=str(i))
                    self.rest.add_rbac_user(
                        host=master,
                        user=user,
                        password=self.test_config.bucket.password,
                        roles=bucket_roles,
                    )

    def throttle_cpu(self):
        """Restrict CPU: pod resource limits on k8s, cpu on/offline via SSH."""
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            if self.test_config.cluster.enable_cpu_cores:
                server_groups = cluster['spec']['servers']
                updated_server_groups = []
                default_cpu = 80
                for server_group in server_groups:
                    resources = server_group.get('resources', {})
                    limits = resources.get('limits', {})
                    limits['cpu'] = default_cpu
                    resources['limits'] = limits
                    server_group['resources'] = resources
                    updated_server_groups.append(server_group)
                cluster['spec']['servers'] = updated_server_groups
            if self.test_config.cluster.online_cores:
                server_groups = cluster['spec']['servers']
                updated_server_groups = []
                # online_cores counts physical cores; limits are in vCPUs.
                online_vcpus = self.test_config.cluster.online_cores * 2
                for server_group in server_groups:
                    resources = server_group.get('resources', {})
                    limits = resources.get('limits', {})
                    limits['cpu'] = online_vcpus
                    resources['limits'] = limits
                    server_group['resources'] = resources
                    updated_server_groups.append(server_group)
                cluster['spec']['servers'] = updated_server_groups
            self.remote.update_cluster_config(cluster, timeout=300,
                                              reboot=True)
        else:
            if self.remote.os == 'Cygwin':
                return
            if self.test_config.cluster.enable_cpu_cores:
                self.remote.enable_cpu()
            if self.test_config.cluster.online_cores:
                self.remote.disable_cpu(self.test_config.cluster.online_cores)

    def tune_memory_settings(self):
        """Apply a kernel memory limit to the configured services."""
        kernel_memory = self.test_config.cluster.kernel_mem_limit
        if kernel_memory:
            if self.dynamic_infra:
                cluster = self.remote.get_cluster()
                server_groups = cluster['spec']['servers']
                tune_services = set()
                # CAO uses different service names than perfrunner
                for service in self.test_config.cluster.kernel_mem_limit_services:
                    if service == 'kv':
                        service = 'data'
                    elif service == 'n1ql':
                        service = 'query'
                    elif service == 'fts':
                        service = 'search'
                    elif service == 'cbas':
                        service = 'analytics'
                    tune_services.add(service)
                updated_server_groups = []
                default_mem = '128Gi'
                for server_group in server_groups:
                    services_in_group = set(server_group['services'])
                    resources = server_group.get('resources', {})
                    limits = resources.get('limits', {})
                    mem_limit = limits.get('memory', default_mem)
                    if services_in_group.intersection(
                            tune_services) and kernel_memory != 0:
                        mem_limit = '{}Mi'.format(kernel_memory)
                    limits['memory'] = mem_limit
                    resources['limits'] = limits
                    server_group['resources'] = resources
                    updated_server_groups.append(server_group)
                cluster['spec']['servers'] = updated_server_groups
                self.remote.update_cluster_config(cluster, timeout=300,
                                                  reboot=True)
            else:
                for service in self.test_config.cluster.kernel_mem_limit_services:
                    for server in self.cluster_spec.servers_by_role(service):
                        self.remote.tune_memory_settings(host_string=server,
                                                         size=kernel_memory)
                self.monitor.wait_for_servers()

    def reset_memory_settings(self):
        """Undo tune_memory_settings on all affected servers."""
        if self.dynamic_infra:
            return
        for service in self.test_config.cluster.kernel_mem_limit_services:
            for server in self.cluster_spec.servers_by_role(service):
                self.remote.reset_memory_settings(host_string=server)
        self.monitor.wait_for_servers()

    def flush_iptables(self):
        if self.dynamic_infra:
            return
        self.remote.flush_iptables()

    def clear_login_history(self):
        if self.dynamic_infra:
            return
        self.remote.clear_wtmp()

    def disable_wan(self):
        if self.dynamic_infra:
            return
        self.remote.disable_wan()

    def enable_ipv6(self):
        """Switch nodes to IPv6 (REST pre-6.5, CLI from 6.5 on)."""
        if self.dynamic_infra:
            return
        if self.test_config.cluster.ipv6:
            version, build_number = self.build.split('-')
            build = tuple(map(int, version.split('.'))) + (int(build_number), )
            if build < (6, 5, 0, 0):
                self.remote.update_ip_family_rest()
            else:
                self.remote.update_ip_family_cli()
            self.remote.enable_ipv6()

    def set_x509_certificates(self):
        """Install x509 certs and enable client certificate auth."""
        if self.dynamic_infra:
            return
        logger.info('Setting x509 settings')
        if self.test_config.access_settings.ssl_mode == "auth":
            self.remote.setup_x509()
            for host in self.cluster_spec.servers:
                self.rest.upload_cluster_certificate(host)
            for host in self.cluster_spec.servers:
                self.rest.reload_cluster_certificate(host)
                self.rest.enable_certificate_auth(host)

    def set_cipher_suite(self):
        if self.dynamic_infra:
            return
        if self.test_config.access_settings.cipher_list:
            check_cipher_suit = self.rest.get_cipher_suite(self.master_node)
            logger.info('current cipher suit: {}'.format(check_cipher_suit))
            self.rest.set_cipher_suite(
                self.master_node, self.test_config.access_settings.cipher_list)
            check_cipher_suit = self.rest.get_cipher_suite(self.master_node)
            logger.info('new cipher suit: {}'.format(check_cipher_suit))

    def set_min_tls_version(self):
        if self.dynamic_infra:
            return
        if self.test_config.access_settings.min_tls_version:
            check_tls_version = self.rest.get_minimum_tls_version(
                self.master_node)
            logger.info('current tls version: {}'.format(check_tls_version))
            self.rest.set_minimum_tls_version(
                self.master_node,
                self.test_config.access_settings.min_tls_version)
            check_tls_version = self.rest.get_minimum_tls_version(
                self.master_node)
            logger.info('new tls version: {}'.format(check_tls_version))

    def get_debug_rpm_url(self):
        """Build the debuginfo RPM URL for the running build's codename."""
        release, build_number = self.build.split('-')
        build = tuple(map(int, release.split('.'))) + (int(build_number), )
        # Map version ranges to release codenames used in the URL path.
        if build > (7, 0, 0, 0):
            release = 'cheshire-cat'
        elif build > (6, 5, 0, 0) and build < (7, 0, 0, 0):
            release = 'mad-hatter'
        elif build < (6, 5, 0, 0):
            release = 'alice'
        centos_version = self.remote.detect_centos_release()
        rpm_url = 'http://latestbuilds.service.couchbase.com/builds/' \
                  'latestbuilds/couchbase-server/{}/{}/' \
                  'couchbase-server-enterprise-debuginfo-{}-centos{}.x86_64.rpm' \
                  ''.format(release, build_number, self.build, centos_version)
        return rpm_url

    def install_cb_debug_rpm(self):
        self.remote.install_cb_debug_rpm(url=self.get_debug_rpm_url())

    def enable_developer_preview(self):
        """Enable developer preview on builds that gate features behind it."""
        release, build_number = self.build.split('-')
        build = tuple(map(int, release.split('.'))) + (int(build_number), )
        # Post-7.0.0-4698 builds, plus 0.0.0-xxxx private builds.
        if build > (7, 0, 0, 4698) or build < (0, 0, 0, 9999):
            self.remote.enable_developer_preview()
class CouchbaseInstaller:
    """Install a Couchbase Server package on all cluster nodes.

    The package is located either from an explicit URL passed as the
    "version" option, or by probing known build-server locations for a
    'release-build' version string.
    """

    def __init__(self, cluster_spec, options):
        self.remote = RemoteHelper(cluster_spec, options.verbose)
        self.options = options

    @property
    def url(self) -> str:
        """Package URL: the option itself if it is a URL, else a search hit."""
        if validators.url(self.options.version):
            return self.options.version
        else:
            return self.find_package(edition=self.options.edition)

    @property
    def release(self) -> str:
        """Release part of 'release-build', e.g. '7.0.0' from '7.0.0-1234'."""
        return self.options.version.split('-')[0]

    @property
    def build(self) -> str:
        """Build number part of 'release-build', or None when absent."""
        split = self.options.version.split('-')
        if len(split) > 1:
            return split[1]

    def find_package(self, edition: str) -> str:
        # Fixed annotation: was '-> [str, str]' (a list literal, not a valid
        # type) — this method returns a single URL string.
        """Return the first candidate URL that exists, or abort the run."""
        for url in self.url_iterator(edition):
            if self.is_exist(url):
                return url
        logger.interrupt('Target build not found')

    def url_iterator(self, edition: str) -> Iterator[str]:
        """Yield candidate package URLs for every pattern/location pair."""
        os_release = None
        if self.remote.package == 'rpm':
            os_release = self.remote.detect_centos_release()
        elif self.remote.package == 'deb':
            os_release = self.remote.detect_ubuntu_release()

        for pkg_pattern in PKG_PATTERNS[self.remote.package]:
            for loc_pattern in LOCATIONS:
                url = loc_pattern + pkg_pattern
                yield url.format(release=self.release, build=self.build,
                                 edition=edition, os=os_release)

    @staticmethod
    def is_exist(url):
        """HTTP HEAD probe; unreachable hosts count as 'not found'."""
        try:
            status_code = requests.head(url).status_code
        except ConnectionError:
            return False
        if status_code == 200:
            return True
        return False

    def download(self):
        """Download and save a copy of the specified package."""
        if self.remote.package == 'rpm':
            logger.info('Saving a local copy of {}'.format(self.url))
            with open('couchbase.rpm', 'wb') as fh:
                resp = requests.get(self.url)
                fh.write(resp.content)
        else:
            logger.interrupt('Unsupported package format')

    def kill_processes(self):
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase()

    def clean_data(self):
        self.remote.clean_data()

    def install_package(self):
        # Resolve the URL once: the property may run a full network search,
        # and the original accessed it twice (log + install).
        url = self.url
        logger.info('Using this URL: {}'.format(url))
        self.remote.upload_iss_files(self.release)
        self.remote.install_couchbase(url)

    def install(self):
        """Full reinstall sequence: kill, uninstall, clean, install."""
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
class CouchbaseInstaller(object):
    """Install Couchbase Server using per-platform package name patterns.

    Detects the target architecture and package format up front, then
    probes the known build locations for a matching file name.
    """

    def __init__(self, cluster_spec, options):
        self.options = options
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        detected_arch = self.remote.detect_arch()
        detected_pkg = self.remote.detect_pkg()

        release = None
        build = None
        if options.version:
            release, build = options.version.split('-')

        self.build = Build(detected_arch, detected_pkg, options.cluster_edition,
                           options.version, release, build, options.url)
        logger.info('Target build info: {}'.format(self.build))

    def get_expected_filenames(self):
        """Yield candidate package file names for the detected package format."""
        pkg = self.build.pkg
        if pkg == 'rpm':
            os_release = self.remote.detect_os_release()
            templates = (
                'couchbase-server-{{edition}}-{{version}}-centos{}.{{arch}}.{{pkg}}'.format(os_release),
                'couchbase-server-{edition}_centos6_{arch}_{version}-rel.{pkg}',
            )
        elif pkg == 'deb':
            templates = (
                'couchbase-server-{edition}_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}_{version}-ubuntu12.04_{arch}.{pkg}',
            )
        elif pkg == 'exe':
            templates = (
                'couchbase-server-{edition}_{arch}_{version}-rel.setup.{pkg}',
                'couchbase-server-{edition}_{version}-windows_{arch}.{pkg}',
                'couchbase_server-{edition}-windows-{arch}-{version}.{pkg}',
            )
        else:
            templates = ()  # Sentinel: unknown format yields nothing

        build_info = self.build.__dict__
        for template in templates:
            yield template.format(**build_info)

    @staticmethod
    def is_exist(url):
        """Return True iff a HEAD request for the URL answers HTTP 200."""
        try:
            return requests.head(url).status_code == 200
        except ConnectionError:
            return False

    def find_package(self):
        """Return (filename, url) of the first package that exists, or abort."""
        build_info = self.build.__dict__
        for filename in self.get_expected_filenames():
            for location in LOCATIONS:
                candidate = '{}{}'.format(location.format(**build_info), filename)
                if self.is_exist(candidate):
                    return filename, candidate
        logger.interrupt('Target build not found')

    def kill_processes(self):
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase(self.build.pkg)

    def clean_data(self):
        self.remote.clean_data()

    def install_package(self):
        # Explicit version -> search the build locations; otherwise a direct URL.
        if self.options.version:
            filename, url = self.find_package()
        else:
            url = self.options.url
            filename = urlparse(url).path.split('/')[-1]
        logger.info('Using this URL: {}'.format(url))
        self.remote.install_couchbase(self.build.pkg, url, filename,
                                      self.build.release)

    def install(self):
        # Full reinstall: stop -> remove -> wipe data -> install fresh.
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
class SGPerfTest(PerfTest):
    """Base class for Sync Gateway performance tests driven by YCSB workloads."""

    # Stat collectors toggled for SG tests: only syncgateway stats are gathered.
    COLLECTORS = {
        'disk': False,
        'ns_server': False,
        'ns_server_overview': False,
        'active_tasks': False,
        'syncgateway_stats': True
    }

    ALL_HOSTNAMES = True
    # Local working directory for the YCSB checkout / collected logs.
    LOCAL_DIR = "YCSB"

    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.memcached = MemcachedHelper(test_config)
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
        self.rest = RestHelper(cluster_spec)
        # self.build = os.environ.get('SGBUILD') or "0.0.0-000"
        self.master_node = next(cluster_spec.masters)
        # Build string comes from the Sync Gateway REST API, not the server.
        self.build = self.rest.get_sgversion(self.master_node)
        self.metrics = MetricHelper(self)
        self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)
        if self.test_config.test_case.use_workers:
            self.worker_manager = WorkerManager(cluster_spec, test_config,
                                                verbose)
        self.settings = self.test_config.access_settings
        self.settings.syncgateway_settings = self.test_config.syncgateway_settings
        self.profiler = Profiler(cluster_spec, test_config)
        self.cluster = ClusterManager(cluster_spec, test_config)
        self.target_iterator = TargetIterator(cluster_spec, test_config)
        self.monitor = Monitor(cluster_spec, test_config, verbose)

    def download_ycsb(self):
        """Clone the YCSB repo on the workers (remote) or locally."""
        if self.worker_manager.is_remote:
            self.remote.clone_ycsb(
                repo=self.test_config.syncgateway_settings.repo,
                branch=self.test_config.syncgateway_settings.branch,
                worker_home=self.worker_manager.WORKER_HOME,
                ycsb_instances=int(
                    self.test_config.syncgateway_settings.instances_per_client))
        else:
            local.clone_ycsb(
                repo=self.test_config.syncgateway_settings.repo,
                branch=self.test_config.syncgateway_settings.branch)

    def collect_execution_logs(self):
        """Fetch YCSB logs from remote workers into a fresh LOCAL_DIR."""
        if self.worker_manager.is_remote:
            if os.path.exists(self.LOCAL_DIR):
                shutil.rmtree(self.LOCAL_DIR, ignore_errors=True)
            os.makedirs(self.LOCAL_DIR)
            self.remote.get_syncgateway_YCSB_logs(
                self.worker_manager.WORKER_HOME,
                self.test_config.syncgateway_settings,
                self.LOCAL_DIR)

    def run_sg_phase(self,
                     phase: str,
                     task: Callable,
                     settings: PhaseSettings,
                     timer: int = None,
                     distribute: bool = False) -> None:
        """Run one named SG phase on the workers and block until it finishes."""
        logger.info('Running {}: {}'.format(phase, pretty_dict(settings)))
        self.worker_manager.run_sg_tasks(task, settings, timer, distribute,
                                         phase)
        self.worker_manager.wait_for_workers()

    def start_memcached(self):
        self.run_sg_phase("start memcached", syncgateway_task_start_memcached,
                          self.settings, self.settings.time, False)

    def load_users(self):
        self.run_sg_phase("load users", syncgateway_task_load_users,
                          self.settings, self.settings.time, False)

    def init_users(self):
        # Only initialize users when the workload uses authentication.
        if self.test_config.syncgateway_settings.auth == 'true':
            self.run_sg_phase("init users", syncgateway_task_init_users,
                              self.settings, self.settings.time, False)

    def grant_access(self):
        if self.test_config.syncgateway_settings.grant_access == 'true':
            self.run_sg_phase("grant access to users",
                              syncgateway_task_grant_access,
                              self.settings, self.settings.time, False)

    def load_docs(self):
        self.run_sg_phase("load docs", syncgateway_task_load_docs,
                          self.settings, self.settings.time, False)

    @with_stats
    @with_profiles
    def run_test(self):
        # The access phase is distributed across workers (distribute=True).
        self.run_sg_phase("run test", syncgateway_task_run_test,
                          self.settings, self.settings.time, True)

    def compress_sg_logs(self):
        self.remote.compress_sg_logs()

    def get_sg_logs(self):
        """Pull Sync Gateway logs from each of the first N configured nodes."""
        initial_nodes = int(self.test_config.syncgateway_settings.nodes)
        ssh_user, ssh_pass = self.cluster_spec.ssh_credentials
        for _server in range(initial_nodes):
            server = self.cluster_spec.servers[_server]
            local.get_sg_logs(host=server, ssh_user=ssh_user,
                              ssh_pass=ssh_pass)

    def run(self):
        """Default test sequence; subclasses supply report_kpi()."""
        self.download_ycsb()
        self.start_memcached()
        self.load_users()
        self.load_docs()
        self.init_users()
        self.grant_access()
        self.run_test()
        self.report_kpi()

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Tear down workers and undo CPU/memory throttling applied for the test."""
        if self.test_config.test_case.use_workers:
            self.worker_manager.download_celery_logs()
            self.worker_manager.terminate()
        if self.test_config.cluster.online_cores:
            self.remote.enable_cpu()
        if self.test_config.cluster.kernel_mem_limit:
            self.remote.reset_memory_settings()
            # NOTE(review): assumed to belong to the kernel_mem_limit branch
            # (servers restart after a memory-settings reset) — confirm intent.
            self.monitor.wait_for_servers()
class ClusterManager:
    """Configure and manage Couchbase clusters before/after a test run.

    Wraps REST, remote (SSH), and monitoring helpers; most methods apply a
    setting to every cluster in the spec via its master node.
    """

    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool = False):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.monitor = Monitor(cluster_spec, test_config, verbose)
        self.memcached = MemcachedHelper(test_config)
        # First master of the first cluster is used for cluster-wide queries.
        self.master_node = next(self.cluster_spec.masters)
        self.initial_nodes = test_config.cluster.initial_nodes

    def is_compatible(self, min_release: str) -> bool:
        """Check that the server version is at least min_release.

        NOTE(review): returns inside the loop, so only the FIRST master is
        checked; presumably every cluster was intended — confirm. Also relies
        on lexicographic string comparison of version strings.
        """
        for master in self.cluster_spec.masters:
            version = self.rest.get_version(master)
            return version >= min_release

    def set_data_path(self):
        for server in self.cluster_spec.servers:
            # Ownership must be fixed before ns_server can use the path.
            self.remote.change_owner(server, self.cluster_spec.data_path)
            self.rest.set_data_path(server, self.cluster_spec.data_path)

    def set_index_path(self):
        for server in self.cluster_spec.servers:
            self.remote.change_owner(server, self.cluster_spec.index_path)
            self.rest.set_index_path(server, self.cluster_spec.index_path)

    def set_analytics_path(self):
        """Create one IO-device sub-path per analytics path and register them."""
        paths = []
        for path in self.cluster_spec.analytics_paths:
            for i in range(self.test_config.analytics_settings.num_io_devices):
                io_device = '{}/dev{}'.format(path, i)
                paths.append(io_device)
        for server in self.cluster_spec.servers_by_role('cbas'):
            for path in self.cluster_spec.analytics_paths:
                self.remote.change_owner(server, path)
            self.rest.set_analytics_paths(server, paths)

    def rename(self):
        for server in self.cluster_spec.servers:
            self.rest.rename(server)

    def set_auth(self):
        for server in self.cluster_spec.servers:
            self.rest.set_auth(server)

    def set_mem_quotas(self):
        """Apply data/index/FTS/analytics/eventing memory quotas per cluster."""
        for master in self.cluster_spec.masters:
            self.rest.set_mem_quota(master,
                                    self.test_config.cluster.mem_quota)
            self.rest.set_index_mem_quota(
                master, self.test_config.cluster.index_mem_quota)
            if self.test_config.cluster.fts_index_mem_quota:
                self.rest.set_fts_index_mem_quota(
                    master, self.test_config.cluster.fts_index_mem_quota)
            if self.test_config.cluster.analytics_mem_quota:
                self.rest.set_analytics_mem_quota(
                    master, self.test_config.cluster.analytics_mem_quota)
            if self.test_config.cluster.eventing_mem_quota:
                self.rest.set_eventing_mem_quota(
                    master, self.test_config.cluster.eventing_mem_quota)

    def set_query_settings(self):
        """Push cbq-engine settings to the first query node, then log them."""
        logger.info('Setting query settings')
        query_nodes = self.cluster_spec.servers_by_role('n1ql')
        if query_nodes:
            settings = self.test_config.n1ql_settings.cbq_settings
            if settings:
                self.rest.set_query_settings(query_nodes[0], settings)
            settings = self.rest.get_query_settings(query_nodes[0])
            settings = pretty_dict(settings)
            logger.info('Query settings: {}'.format(settings))

    def set_index_settings(self):
        """Push GSI settings to the first index node, then log them."""
        logger.info('Setting index settings')
        index_nodes = self.cluster_spec.servers_by_role('index')
        if index_nodes:
            settings = self.test_config.gsi_settings.settings
            if settings:
                self.rest.set_index_settings(index_nodes[0], settings)
            settings = self.rest.get_index_settings(index_nodes[0])
            settings = pretty_dict(settings)
            logger.info('Index settings: {}'.format(settings))

    def set_services(self):
        # Per-node services require 4.0+; older releases are data-only.
        if not self.is_compatible(min_release='4.0.0'):
            return
        for master in self.cluster_spec.masters:
            roles = self.cluster_spec.roles[master]
            self.rest.set_services(master, roles)

    def add_nodes(self):
        """Add the remaining initial nodes of each cluster to its master."""
        for (_, servers), initial_nodes in zip(self.cluster_spec.clusters,
                                               self.initial_nodes):
            if initial_nodes < 2:  # Single-node cluster
                continue
            master = servers[0]
            for node in servers[1:initial_nodes]:
                roles = self.cluster_spec.roles[node]
                self.rest.add_node(master, node, roles)

    def rebalance(self):
        """Rebalance each cluster over its initial nodes and wait for health."""
        for (_, servers), initial_nodes in zip(self.cluster_spec.clusters,
                                               self.initial_nodes):
            master = servers[0]
            known_nodes = servers[:initial_nodes]
            ejected_nodes = []
            self.rest.rebalance(master, known_nodes, ejected_nodes)
            self.monitor.monitor_rebalance(master)
        self.wait_until_healthy()

    def increase_bucket_limit(self, num_buckets: int):
        for master in self.cluster_spec.masters:
            self.rest.increase_bucket_limit(master, num_buckets)

    def flush_buckets(self):
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.flush_bucket(host=master, bucket=bucket_name)

    def delete_buckets(self):
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.delete_bucket(host=master, name=bucket_name)

    def create_buckets(self):
        """Create the test buckets, splitting the quota evenly between them."""
        mem_quota = self.test_config.cluster.mem_quota
        if self.test_config.cluster.num_buckets > 7:
            # Default server limit is 10 buckets; raise it with headroom.
            self.increase_bucket_limit(self.test_config.cluster.num_buckets + 3)
        if self.test_config.cluster.eventing_metadata_bucket_mem_quota:
            # Reserve quota for the eventing buckets created separately.
            mem_quota -= (
                self.test_config.cluster.eventing_metadata_bucket_mem_quota +
                self.test_config.cluster.eventing_bucket_mem_quota)
        per_bucket_quota = mem_quota // self.test_config.cluster.num_buckets
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.create_bucket(
                    host=master,
                    name=bucket_name,
                    ram_quota=per_bucket_quota,
                    password=self.test_config.bucket.password,
                    replica_number=self.test_config.bucket.replica_number,
                    replica_index=self.test_config.bucket.replica_index,
                    eviction_policy=self.test_config.bucket.eviction_policy,
                    bucket_type=self.test_config.bucket.bucket_type,
                    conflict_resolution_type=self.test_config.bucket.conflict_resolution_type,
                    compression_mode=self.test_config.bucket.compression_mode,
                )

    def create_eventing_buckets(self):
        """Create eventing source buckets from their dedicated quota."""
        if not self.test_config.cluster.eventing_bucket_mem_quota:
            return
        per_bucket_quota = \
            self.test_config.cluster.eventing_bucket_mem_quota \
            // self.test_config.cluster.eventing_buckets
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.eventing_buckets:
                self.rest.create_bucket(
                    host=master,
                    name=bucket_name,
                    ram_quota=per_bucket_quota,
                    password=self.test_config.bucket.password,
                    replica_number=self.test_config.bucket.replica_number,
                    replica_index=self.test_config.bucket.replica_index,
                    eviction_policy=self.test_config.bucket.eviction_policy,
                    bucket_type=self.test_config.bucket.bucket_type,
                    conflict_resolution_type=self.test_config.bucket.conflict_resolution_type,
                )

    def create_eventing_metadata_bucket(self):
        """Create the fixed-name eventing metadata bucket (class defaults)."""
        if not self.test_config.cluster.eventing_metadata_bucket_mem_quota:
            return
        for master in self.cluster_spec.masters:
            self.rest.create_bucket(
                host=master,
                name=self.test_config.cluster.EVENTING_METADATA_BUCKET_NAME,
                ram_quota=self.test_config.cluster.eventing_metadata_bucket_mem_quota,
                password=self.test_config.bucket.password,
                replica_number=self.test_config.bucket.replica_number,
                replica_index=self.test_config.bucket.replica_index,
                eviction_policy=self.test_config.bucket.EVICTION_POLICY,
                bucket_type=self.test_config.bucket.BUCKET_TYPE,
            )

    def configure_auto_compaction(self):
        compaction_settings = self.test_config.compaction
        for master in self.cluster_spec.masters:
            self.rest.configure_auto_compaction(master, compaction_settings)
            settings = self.rest.get_auto_compaction_settings(master)
            logger.info('Auto-compaction settings: {}'.format(
                pretty_dict(settings)))

    def configure_internal_settings(self):
        internal_settings = self.test_config.internal_settings
        for master in self.cluster_spec.masters:
            for parameter, value in internal_settings.items():
                self.rest.set_internal_settings(
                    master, {parameter: maybe_atoi(value)})

    def configure_xdcr_settings(self):
        xdcr_cluster_settings = self.test_config.xdcr_cluster_settings
        for master in self.cluster_spec.masters:
            for parameter, value in xdcr_cluster_settings.items():
                self.rest.set_xdcr_cluster_settings(
                    master, {parameter: maybe_atoi(value)})

    def tweak_memory(self):
        """Normalize OS memory behavior (swap, caches, THP) before a run."""
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def enable_n2n_encryption(self):
        if self.test_config.cluster.enable_n2n_encryption:
            for master in self.cluster_spec.masters:
                self.remote.enable_n2n_encryption(
                    master, self.test_config.cluster.enable_n2n_encryption)

    def restart_with_alternative_num_vbuckets(self):
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        """Apply custom buckets settings.

        Tune bucket settings (e.g., max_num_shards or max_num_auxio)
        using "/diag/eval" and restart the entire cluster.
        """
        if self.test_config.bucket_extras:
            self.remote.enable_nonlocal_diag_eval()

        # Erlang expression template; double braces escape literal braces.
        cmd = 'ns_bucket:update_bucket_props("{}", ' \
              '[{{extra_config_string, "{}={}"}}]).'

        for option, value in self.test_config.bucket_extras.items():
            if option == 'num_writer_threads':
                self.rest.set_num_writer_threads(self.master_node, int(value))
            elif option == 'num_reader_threads':
                self.rest.set_num_reader_threads(self.master_node, int(value))
            else:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.cluster_spec.masters:
                    for bucket in self.test_config.buckets:
                        diag_eval = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, diag_eval)

        if self.test_config.bucket_extras:
            # Restart the whole cluster so the extra config takes effect.
            self.remote.restart()
            self.wait_until_healthy()

    def tune_logging(self):
        self.remote.tune_log_rotation()
        self.remote.restart()

    def enable_auto_failover(self):
        enabled = self.test_config.bucket.autofailover_enabled
        failover_min = self.test_config.bucket.failover_min
        failover_max = self.test_config.bucket.failover_max
        for master in self.cluster_spec.masters:
            self.rest.set_auto_failover(master, enabled,
                                        failover_min, failover_max)

    def wait_until_warmed_up(self):
        # Ephemeral and memcached buckets have no warmup phase.
        if self.test_config.bucket.bucket_type in ('ephemeral', 'memcached'):
            return
        for master in self.cluster_spec.masters:
            for bucket in self.test_config.buckets:
                self.monitor.monitor_warmup(self.memcached, master, bucket)

    def wait_until_healthy(self):
        for master in self.cluster_spec.masters:
            self.monitor.monitor_node_health(master)
            for analytics_node in self.rest.get_active_nodes_by_role(
                    master, 'cbas'):
                self.monitor.monitor_analytics_node_active(analytics_node)

    def gen_disabled_audit_events(self, master: str) -> List[str]:
        """Return currently-disabled audit events minus the extra ones to keep."""
        curr_settings = self.rest.get_audit_settings(master)
        curr_disabled = {str(event) for event in curr_settings['disabled']}
        disabled = curr_disabled - self.test_config.audit_settings.extra_events
        return list(disabled)

    def enable_audit(self):
        # Audit requires 4.0+ Enterprise edition.
        if not self.is_compatible(min_release='4.0.0') or \
                self.rest.is_community(self.master_node):
            return
        if not self.test_config.audit_settings.enabled:
            return
        for master in self.cluster_spec.masters:
            disabled = []
            if self.test_config.audit_settings.extra_events:
                disabled = self.gen_disabled_audit_events(master)
            self.rest.enable_audit(master, disabled)

    def generate_ce_roles(self) -> List[str]:
        # Community edition only supports full admin.
        return ['admin']

    def generate_ee_roles(self) -> List[str]:
        """Return the per-bucket role templates that this server supports."""
        existing_roles = {
            r['role'] for r in self.rest.get_rbac_roles(self.master_node)
        }
        roles = []
        for role in (
                'bucket_admin',
                'data_dcp_reader',
                'data_monitoring',
                'data_reader_writer',
                'data_reader',
                'data_writer',
                'fts_admin',
                'fts_searcher',
                'query_delete',
                'query_insert',
                'query_select',
                'query_update',
                'views_admin',
        ):
            if role in existing_roles:
                # '{bucket}' placeholder is filled in by add_rbac_users().
                roles.append(role + '[{bucket}]')
        return roles

    def delete_rbac_users(self):
        # RBAC users exist only on 5.0+.
        if not self.is_compatible(min_release='5.0'):
            return
        for master in self.cluster_spec.masters:
            for bucket in self.test_config.buckets:
                self.rest.delete_rbac_user(host=master, bucket=bucket)

    def add_rbac_users(self):
        """Create an admin user plus one user per bucket (named after it)."""
        if not self.rest.supports_rbac(self.master_node):
            logger.info('RBAC not supported - skipping adding RBAC users')
            return
        if self.rest.is_community(self.master_node):
            roles = self.generate_ce_roles()
        else:
            roles = self.generate_ee_roles()

        for master in self.cluster_spec.masters:
            admin_user, admin_password = self.cluster_spec.rest_credentials
            self.rest.add_rbac_user(
                host=master,
                user=admin_user,
                password=admin_password,
                roles=['admin'],
            )
            for bucket in self.test_config.buckets:
                bucket_roles = [role.format(bucket=bucket) for role in roles]
                bucket_roles.append("admin")
                self.rest.add_rbac_user(
                    host=master,
                    user=bucket,  # Backward compatibility
                    password=self.test_config.bucket.password,
                    roles=bucket_roles,
                )

    def throttle_cpu(self):
        """Limit online CPU cores when the test asks for it (no-op on Windows)."""
        if self.remote.os == 'Cygwin':
            return
        self.remote.enable_cpu()
        if self.test_config.cluster.online_cores:
            self.remote.disable_cpu(self.test_config.cluster.online_cores)

    def tune_memory_settings(self):
        """Apply a kernel memory limit to servers of the configured services."""
        kernel_memory = self.test_config.cluster.kernel_mem_limit
        if kernel_memory:
            for service in self.test_config.cluster.kernel_mem_limit_services:
                for server in self.cluster_spec.servers_by_role(service):
                    self.remote.tune_memory_settings(host_string=server,
                                                     size=kernel_memory)
            self.monitor.wait_for_servers()

    def reset_memory_settings(self):
        for service in self.test_config.cluster.kernel_mem_limit_services:
            for server in self.cluster_spec.servers_by_role(service):
                self.remote.reset_memory_settings(host_string=server)
        self.monitor.wait_for_servers()

    def flush_iptables(self):
        self.remote.flush_iptables()

    def clear_login_history(self):
        self.remote.clear_wtmp()

    def disable_wan(self):
        self.remote.disable_wan()

    def enable_ipv6(self):
        if self.test_config.cluster.ipv6:
            self.remote.enable_ipv6()

    def set_x509_certificates(self):
        """Upload and reload x509 certificates when client cert auth is used."""
        logger.info('Setting x509 settings')
        if self.test_config.access_settings.ssl_mode == "auth":
            self.remote.setup_x509()
            for host in self.cluster_spec.servers:
                self.rest.upload_cluster_certificate(host)
            for host in self.cluster_spec.servers:
                self.rest.reload_cluster_certificate(host)
                self.rest.enable_certificate_auth(host)

    def set_cipher_suite(self):
        if self.test_config.access_settings.cipher_list:
            check_cipher_suit = self.rest.get_cipher_suite(self.master_node)
            logger.info('current cipher suit: {}'.format(check_cipher_suit))
            self.rest.set_cipher_suite(
                self.master_node, self.test_config.access_settings.cipher_list)
            check_cipher_suit = self.rest.get_cipher_suite(self.master_node)
            logger.info('new cipher suit: {}'.format(check_cipher_suit))

    def set_min_tls_version(self):
        if self.test_config.access_settings.min_tls_version:
            check_tls_version = self.rest.get_minimum_tls_version(
                self.master_node)
            logger.info('current tls version: {}'.format(check_tls_version))
            self.rest.set_minimum_tls_version(
                self.master_node,
                self.test_config.access_settings.min_tls_version)
            check_tls_version = self.rest.get_minimum_tls_version(
                self.master_node)
            logger.info('new tls version: {}'.format(check_tls_version))
class CouchbaseInstaller:
    """Install Couchbase Server, with cloud-infrastructure awareness.

    Resolves the package URL from the requested version/edition (or accepts
    a literal URL) and supports downloading a local or remote copy of the
    package in addition to installing it on the cluster nodes.
    """

    def __init__(self, cluster_spec, options):
        self.remote = RemoteHelper(cluster_spec, options.verbose)
        self.options = options
        self.cluster_spec = cluster_spec

    @property
    def url(self) -> str:
        # A literal URL in the options wins; otherwise search for a build.
        if validators.url(self.options.couchbase_version):
            return self.options.couchbase_version
        else:
            return self.find_package(edition=self.options.edition)

    @property
    def release(self) -> str:
        # "7.0.0-5247" -> "7.0.0"
        return self.options.couchbase_version.split('-')[0]

    @property
    def build(self) -> str:
        # "7.0.0-5247" -> "5247"; implicitly None when no build number given.
        split = self.options.couchbase_version.split('-')
        if len(split) > 1:
            return split[1]

    def find_package(self, edition: str,
                     package: str = None,
                     os_release: str = None) -> str:
        """Return the first candidate URL that actually exists, or abort."""
        for url in self.url_iterator(edition, package, os_release):
            if self.is_exist(url):
                return url
        logger.interrupt('Target build not found')

    def url_iterator(self, edition: str,
                     package: str = None,
                     os_release: str = None) -> Iterator[str]:
        """Yield candidate package URLs for every known location and pattern.

        When no package format is given, it is detected from the remote
        nodes; on cloud infrastructure the OS release string is derived
        from the configured os_arch instead of being probed.
        """
        if package is None:
            if self.remote.package == 'rpm':
                if self.cluster_spec.cloud_infrastructure:
                    os_arch = self.cluster_spec.infrastructure_settings.get(
                        'os_arch', 'x86_64')
                    if os_arch == 'arm':
                        os_release = 'amzn2.aarch64'
                    elif os_arch == 'al2':
                        # BUG FIX: was `os_release == 'amzn2.x86_64'` — a
                        # no-op comparison instead of an assignment, which
                        # left os_release unset for Amazon Linux 2 targets.
                        os_release = 'amzn2.x86_64'
                    else:
                        os_release = self.remote.detect_centos_release()
                else:
                    os_release = self.remote.detect_centos_release()
            elif self.remote.package == 'deb':
                os_release = self.remote.detect_ubuntu_release()
            package = self.remote.package

        for pkg_pattern in PKG_PATTERNS[package]:
            for loc_pattern in LOCATIONS:
                url = loc_pattern + pkg_pattern
                yield url.format(release=self.release, build=self.build,
                                 edition=edition, os=os_release)

    @staticmethod
    def is_exist(url):
        """Return True iff a HEAD request for the URL answers HTTP 200."""
        try:
            status_code = requests.head(url).status_code
        except ConnectionError:
            return False
        return status_code == 200

    def download(self):
        """Download and save a copy of the specified package."""
        if self.remote.package == 'rpm':
            logger.info('Saving a local copy of {}'.format(self.url))
            with open('couchbase.rpm', 'wb') as fh:
                resp = requests.get(self.url)
                fh.write(resp.content)
        else:
            logger.interrupt('Unsupported package format')

    def download_local(self, local_copy_url: str = None):
        """Download and save a copy of the specified package.

        Best-effort: failures are logged, not raised, because the package
        may legitimately be absent for this platform.
        """
        try:
            if RemoteHelper.detect_server_os("127.0.0.1", self.cluster_spec).\
                    upper() in ('UBUNTU', 'DEBIAN'):
                os_release = detect_ubuntu_release()
                if local_copy_url:
                    url = local_copy_url
                else:
                    url = self.find_package(edition=self.options.edition,
                                            package="deb",
                                            os_release=os_release)
                logger.info('Saving a local copy of {}'.format(url))
                with open('couchbase.deb', 'wb') as fh:
                    resp = requests.get(url)
                    fh.write(resp.content)
        except Exception:
            # Was `except (Exception, BaseException)`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            logger.info("Saving local copy for ubuntu failed, package may not present")

    def download_remote(self):
        """Download and save a copy of the specified package on a remote client."""
        if self.remote.package == 'rpm':
            logger.info('Saving a remote copy of {}'.format(self.url))
            self.wget(url=self.url)
        else:
            logger.interrupt('Unsupported package format')

    @master_client
    def wget(self, url):
        """Fetch the package into /tmp on the master client as couchbase.rpm."""
        logger.info('Fetching {}'.format(url))
        with cd('/tmp'):
            run('wget -nc "{}"'.format(url))
            package = url.split('/')[-1]
            run('mv {} couchbase.rpm'.format(package))

    def kill_processes(self):
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase()

    def clean_data(self):
        self.remote.clean_data()

    def install_package(self):
        logger.info('Using this URL: {}'.format(self.url))
        self.remote.upload_iss_files(self.release)
        self.remote.install_couchbase(self.url)

    def install(self):
        # Full reinstall: stop -> remove -> wipe data -> install fresh.
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
def __init__(self, cluster_spec, test_config, verbose):
    """Keep references to the cluster spec / test config and set up SSH access."""
    self.remote = RemoteHelper(cluster_spec, test_config, verbose)
    self.cluster_spec = cluster_spec
    self.test_config = test_config
class OperatorInstaller:
    """Install the Couchbase Autonomous Operator stack on Kubernetes.

    Resolves image tags for the operator, admission controller, server and
    backup images (pre-release "X.Y.Z-build" versions come from the internal
    GitLab registry, plain versions from Docker Hub), then drives the
    create/wait/delete lifecycle of the Kubernetes resources.
    """

    def __init__(self, cluster_spec, options):
        self.options = options
        self.cluster_spec = cluster_spec

        self.operator_version = self.options.operator_version
        if "-" in self.operator_version:
            # Pre-release build: pull from the internal registry.
            self.operator_release = self.operator_version.split("-")[0]
            self.operator_tag = 'registry.gitlab.com/cb-vanilla/operator:{}'\
                .format(self.operator_version)
            self.admission_controller_release = self.operator_version.split("-")[0]
            self.admission_controller_tag = \
                'registry.gitlab.com/cb-vanilla/admission-controller:{}' \
                .format(self.operator_version)
        else:
            self.operator_release = self.operator_version
            self.operator_tag = 'couchbase/operator:{}'\
                .format(self.operator_version)
            self.admission_controller_release = self.operator_version
            self.admission_controller_tag = 'couchbase/admission-controller:{}' \
                .format(self.operator_version)

        self.couchbase_version = self.options.couchbase_version
        if "-" in self.couchbase_version:
            self.couchbase_release = self.couchbase_version.split("-")[0]
            self.couchbase_tag = 'registry.gitlab.com/cb-vanilla/server:{}'\
                .format(self.couchbase_version)
        else:
            self.couchbase_release = self.couchbase_version
            self.couchbase_tag = 'couchbase/server:{}'\
                .format(self.couchbase_version)

        self.operator_backup_version = self.options.operator_backup_version
        if self.operator_backup_version:
            if "-" in self.operator_backup_version:
                self.operator_backup_release = \
                    self.operator_backup_version.split("-")[0]
                self.operator_backup_tag = \
                    'registry.gitlab.com/cb-vanilla/operator-backup:{}'\
                    .format(self.operator_backup_version)
            else:
                self.operator_backup_release = self.operator_backup_version
                # BUG FIX: was 'couchbase/operator-backup/{}' — a '/' where a
                # Docker image reference requires 'name:tag' (compare the
                # registry branch above).
                self.operator_backup_tag = 'couchbase/operator-backup:{}'\
                    .format(self.operator_backup_version)
        else:
            self.operator_backup_tag = \
                'registry.gitlab.com/cb-vanilla/operator-backup:latest'

        self.node_count = len(
            self.cluster_spec.infrastructure_clusters['couchbase1'].split())
        self.remote = RemoteHelper(cluster_spec)
        self.docker_config_path = os.path.expanduser("~") + \
            "/.docker/config.json"

        # Resource manifests live under cloud/operator/<major>/<minor>.
        self.operator_base_path = "cloud/operator/{}/{}"\
            .format(self.operator_release.split(".")[0],
                    self.operator_release.split(".")[1])
        self.certificate_authority_path = "{}/ca.crt"\
            .format(self.operator_base_path)
        self.crd_path = "{}/crd.yaml"\
            .format(self.operator_base_path)
        self.config_path = "{}/config.yaml"\
            .format(self.operator_base_path)
        self.config_template_path = "{}/config_template.yaml"\
            .format(self.operator_base_path)
        self.auth_path = "{}/auth_secret.yaml"\
            .format(self.operator_base_path)
        self.cb_cluster_path = "{}/couchbase-cluster.yaml"\
            .format(self.operator_base_path)
        self.template_cb_cluster_path = "{}/couchbase-cluster_template.yaml"\
            .format(self.operator_base_path)
        self.worker_base_path = "cloud/worker"
        self.worker_path = "{}/worker.yaml"\
            .format(self.worker_base_path)
        self.rmq_base_path = "cloud/broker/rabbitmq/0.48"
        self.rmq_operator_path = "{}/cluster-operator.yaml"\
            .format(self.rmq_base_path)
        self.rmq_cluster_path = "{}/rabbitmq.yaml"\
            .format(self.rmq_base_path)

    def install(self):
        self.install_operator()
        self.install_celery_broker()

    def install_operator(self):
        """Create the operator resources and wait until the cluster is ready."""
        logger.info("installing operator")
        self.create_secrets()
        self.create_crd()
        self.create_config()
        self.wait_for_operator_and_admission()
        self.create_auth()
        self.create_cluster()
        self.wait_for_cluster()

    def install_celery_broker(self):
        """Deploy the RabbitMQ operator and broker used by the Celery workers."""
        logger.info("installing celery broker")
        self.create_rabbitmq_operator()
        self.wait_for_rabbitmq_operator()
        self.create_rabbitmq_cluster()
        self.wait_for_rabbitmq_cluster()
        self.creating_rabbitmq_config()

    def uninstall(self):
        self.uninstall_operator()
        self.uninstall_celery_broker()
        self.uninstall_workers()
        self.delete_artifacts()

    def uninstall_operator(self):
        logger.info("uninstalling operator")
        self.delete_operator_files()
        self.delete_operator_secrets()
        self.wait_for_operator_deletion()

    def uninstall_celery_broker(self):
        logger.info("uninstalling celery broker")
        self.delete_rabbitmq_files()
        self.wait_for_rabbitmq_deletion()

    def uninstall_workers(self):
        logger.info("uninstall workers")
        self.delete_worker_files()
        self.wait_for_worker_deletion()

    def create_secrets(self):
        logger.info("creating secrets")
        self.remote.create_docker_secret(self.docker_config_path)
        self.remote.create_operator_tls_secret(self.certificate_authority_path)

    def create_crd(self):
        logger.info("creating CRD")
        self.remote.create_from_file(self.crd_path)

    def create_config(self):
        logger.info("creating config")
        self.remote.create_operator_config(
            self.config_template_path, self.config_path,
            self.operator_tag, self.admission_controller_tag)

    def create_auth(self):
        logger.info("creating auth")
        self.remote.create_from_file(self.auth_path)

    def create_cluster(self):
        logger.info("creating couchbase cluster")
        self.remote.create_couchbase_cluster(
            self.template_cb_cluster_path, self.cb_cluster_path,
            self.couchbase_tag, self.operator_backup_tag, self.node_count)

    def wait_for_operator_and_admission(self):
        logger.info("waiting for operator and admission controller")
        self.remote.wait_for_admission_controller_ready()
        self.remote.wait_for_operator_ready()

    def wait_for_cluster(self):
        logger.info("waiting for cluster")
        self.remote.wait_for_couchbase_pods_ready(self.node_count)

    def create_rabbitmq_operator(self):
        logger.info("creating rabbitmq operator")
        self.remote.create_from_file(self.rmq_operator_path)

    def wait_for_rabbitmq_operator(self):
        logger.info("waiting for rabbitmq operator")
        self.remote.wait_for_rabbitmq_operator_ready()

    def create_rabbitmq_cluster(self):
        logger.info("creating rabbitmq cluster")
        self.remote.create_from_file(self.rmq_cluster_path)

    def wait_for_rabbitmq_cluster(self):
        logger.info("waiting for rabbitmq cluster")
        self.remote.wait_for_rabbitmq_broker_ready()

    def creating_rabbitmq_config(self):
        logger.info("creating rabbitmq config")
        self.remote.upload_rabbitmq_config()

    def delete_operator_files(self):
        logger.info("deleting operator files")
        files = [self.cb_cluster_path, self.auth_path, self.config_path,
                 self.crd_path]
        self.remote.delete_from_files(files)

    def delete_operator_secrets(self):
        logger.info("deleting operator secrets")
        secrets = ['regcred', 'couchbase-operator-tls', 'couchbase-server-tls',
                   'user-password-secret']
        self.remote.delete_secrets(secrets)

    def wait_for_operator_deletion(self):
        logger.info("waiting for operator deletion")
        self.remote.wait_for_operator_deletion()

    def delete_rabbitmq_files(self):
        logger.info("deleting rabbit mq files")
        self.remote.delete_from_files(
            [self.rmq_cluster_path, self.rmq_operator_path])

    def wait_for_rabbitmq_deletion(self):
        logger.info("waiting for rabbitmq deletion")
        self.remote.wait_for_rabbitmq_deletion()

    def delete_worker_files(self):
        logger.info("deleting worker files")
        self.remote.delete_from_file(self.worker_path)

    def wait_for_worker_deletion(self):
        logger.info("waiting for worker deletion")
        self.remote.wait_for_workers_deletion()

    def delete_artifacts(self):
        logger.info("deleting any artifact pods, pvcs, and backups")
        self.remote.delete_all_backups()
        self.remote.delete_all_pods()
        self.remote.delete_all_pvc()
class CbAgent(object):
    """Manage cbagent stats collectors for a test run.

    Builds one logical cbmonitor "cluster" entry per physical cluster
    (plus optional per-client entries), instantiates the requested
    collectors, and runs each collector in its own process.
    """

    def __init__(self, test):
        self.clusters = OrderedDict()
        self.remote = RemoteHelper(test.cluster_spec, test.test_config,
                                   verbose=True)

        # One entry per physical cluster; the name embeds the build number
        # and a short random suffix so that every run is unique.
        for cluster_name, servers in test.cluster_spec.yield_clusters():
            cluster = "{}_{}_{}".format(cluster_name,
                                        test.build.replace(".", ""),
                                        uhex()[:3])
            master = servers[0].split(":")[0]
            self.clusters[cluster] = master

        if test.test_config.test_case.monitor_clients:
            for node in test.cluster_spec.workers:
                # FIX: dict views are not indexable on Python 3
                # (self.clusters.items()[0] raises TypeError); use the
                # first inserted key instead.
                first_cluster = next(iter(self.clusters))
                cluster = "{}{}".format(first_cluster[:-3], uhex()[:3])
                master = node.split(":")[0]
                self.clusters[cluster] = master

        # Last cluster with index nodes wins; empty string if none.
        self.index_node = ""
        for _, servers in test.cluster_spec.yield_servers_by_role("index"):
            if servers:
                self.index_node = servers[0].split(":")[0]

        # ALL_BUCKETS / ALL_HOSTNAMES are class-level markers on the test.
        if hasattr(test, "ALL_BUCKETS"):
            buckets = None
        else:
            buckets = test.test_config.buckets[:1]
        if hasattr(test, "ALL_HOSTNAMES"):
            hostnames = tuple(test.cluster_spec.yield_hostnames())
        else:
            hostnames = None

        # Ad-hoc settings object shared (via copy()) by all collectors.
        self.settings = type(
            "settings", (object,), {
                "seriesly_host": test.test_config.stats_settings.seriesly["host"],
                "cbmonitor_host_port": test.test_config.stats_settings.cbmonitor["host"],
                "interval": test.test_config.stats_settings.interval,
                "secondary_statsfile": test.test_config.stats_settings.secondary_statsfile,
                "buckets": buckets,
                "hostnames": hostnames,
                "sync_gateway_nodes": test.remote.gateways if test.remote else None,
                "monitor_clients": test.cluster_spec.workers if test.test_config.test_case.monitor_clients else None,
            },
        )()
        self.lat_interval = test.test_config.stats_settings.lat_interval
        if test.cluster_spec.ssh_credentials:
            self.settings.ssh_username, self.settings.ssh_password = test.cluster_spec.ssh_credentials
        self.settings.rest_username, self.settings.rest_password = test.cluster_spec.rest_credentials
        self.settings.bucket_password = test.test_config.bucket.password
        self.settings.index_node = self.index_node

        self.collectors = []
        self.processes = []
        self.snapshots = []
        self.bandwidth = False

    def prepare_collectors(
        self,
        test,
        bandwidth=False,
        subdoc_latency=False,
        latency=False,
        secondary_stats=False,
        query_latency=False,
        spatial_latency=False,
        n1ql_latency=False,
        n1ql_stats=False,
        index_latency=False,
        persist_latency=False,
        replicate_latency=False,
        xdcr_lag=False,
        secondary_latency=False,
        secondary_debugstats=False,
    ):
        """Instantiate the collectors selected by the keyword flags."""
        clusters = self.clusters.keys()
        self.bandwidth = bandwidth

        self.prepare_ns_server(clusters)
        self.prepare_active_tasks(clusters)
        # OS-level collectors are skipped on Windows (Cygwin) hosts, which
        # get TypePerf instead.
        if test.remote is None or test.remote.os != "Cygwin":
            self.prepare_ps(clusters)
            self.prepare_net(clusters)
            self.prepare_iostat(clusters, test)
        elif test.remote.os == "Cygwin":
            self.prepare_tp(clusters)

        if subdoc_latency:
            self.prepare_subdoc_latency(clusters, test)
        if latency:
            self.prepare_latency(clusters, test)
        if query_latency:
            self.prepare_query_latency(clusters, test)
        if spatial_latency:
            self.prepare_spatial_latency(clusters, test)
        if n1ql_latency:
            self.prepare_n1ql_latency(clusters, test)
        if secondary_stats:
            self.prepare_secondary_stats(clusters)
        if secondary_debugstats:
            self.prepare_secondary_debugstats(clusters)
        if secondary_latency:
            self.prepare_secondary_latency(clusters)
        if n1ql_stats:
            self.prepare_n1ql_stats(clusters)
        if index_latency:
            self.prepare_index_latency(clusters)
        if persist_latency:
            self.prepare_persist_latency(clusters)
        if replicate_latency:
            self.prepare_replicate_latency(clusters)
        if xdcr_lag:
            self.prepare_xdcr_lag(clusters)

    def prepare_ns_server(self, clusters):
        """Add an NSServer collector per cluster (best-effort: skipped on error)."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            collector = NSServer(settings)
            try:
                # Probe bucket discovery before committing to the collector.
                sum(1 for _ in collector.get_buckets())
                self.collectors.append(collector)
            except RuntimeError:
                pass

    def prepare_secondary_stats(self, clusters):
        """Add a SecondaryStats collector per cluster."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(SecondaryStats(settings))

    def prepare_secondary_debugstats(self, clusters):
        """Add a SecondaryDebugStats collector per cluster."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(SecondaryDebugStats(settings))

    def prepare_secondary_latency(self, clusters):
        """Add a SecondaryLatencyStats collector per cluster."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(SecondaryLatencyStats(settings))

    def prepare_n1ql_stats(self, clusters):
        """Add an N1QLStats collector per cluster."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(N1QLStats(settings))

    def prepare_ps(self, clusters):
        """Add a PS (process stats) collector per cluster."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            ps_collector = PS(settings)
            self.collectors.append(ps_collector)

    def prepare_tp(self, clusters):
        """Add a TypePerf collector per cluster (Windows hosts)."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            tp_collector = TypePerf(settings)
            self.collectors.append(tp_collector)

    def prepare_net(self, clusters):
        """Add a Net (network stats) collector per cluster."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            net_collector = Net(settings)
            self.collectors.append(net_collector)

    def prepare_iostat(self, clusters, test):
        """Add an IO collector per cluster.

        If tests are run locally, no paths are defined in the cluster spec,
        hence use the paths that are set by the server itself; get those
        paths via the REST API.
        """
        rest = None
        if test.cluster_spec.paths:
            data_path, index_path = test.cluster_spec.paths
        else:
            rest = RestHelper(test.cluster_spec)
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            if rest is not None:
                data_path, index_path = rest.get_data_path(settings.master_node)
            partitions = {"data": data_path}
            if hasattr(test, "ddocs"):  # all instances of IndexTest have it
                partitions["index"] = index_path
            settings.partitions = partitions
            io_collector = IO(settings)
            self.collectors.append(io_collector)

    def prepare_persist_latency(self, clusters):
        """Add an ObserveLatency collector (persist mode) per cluster."""
        for i, cluster in enumerate(clusters):
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            settings.observe = "persist"
            self.collectors.append(ObserveLatency(settings))

    def prepare_replicate_latency(self, clusters):
        """Add an ObserveLatency collector (replicate mode) per cluster."""
        for i, cluster in enumerate(clusters):
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            settings.observe = "replicate"
            self.collectors.append(ObserveLatency(settings))

    def prepare_index_latency(self, clusters):
        """Add an ObserveLatency collector (index mode) per cluster."""
        for i, cluster in enumerate(clusters):
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            settings.observe = "index"
            self.collectors.append(ObserveLatency(settings))

    def prepare_xdcr_lag(self, clusters):
        """Add an XdcrLag collector per cluster, pairing each with its peer."""
        # FIX: dict_keys is not reversible before Python 3.8; materialize the
        # keys first so the reversed pairing works on any Python 3.
        reversed_clusters = list(reversed(list(self.clusters)))
        for i, cluster in enumerate(clusters):
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            dest_cluster = reversed_clusters[i]
            settings.dest_master_node = self.clusters[dest_cluster]
            self.collectors.append(XdcrLag(settings))

    def prepare_latency(self, clusters, test):
        """Add a SpringLatency collector per cluster."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            prefix = test.target_iterator.prefix or \
                target_hash(settings.master_node.split(":")[0])
            self.collectors.append(SpringLatency(settings, test.workload, prefix))

    def prepare_subdoc_latency(self, clusters, test):
        """Add a SpringSubdocLatency collector per cluster."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            prefix = test.target_iterator.prefix or \
                target_hash(settings.master_node.split(":")[0])
            self.collectors.append(SpringSubdocLatency(settings, test.workload, prefix))

    def prepare_query_latency(self, clusters, test):
        """Add a SpringQueryLatency collector per cluster."""
        params = test.test_config.index_settings.params
        index_type = test.test_config.index_settings.index_type
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            prefix = test.target_iterator.prefix or \
                target_hash(settings.master_node.split(":")[0])
            self.collectors.append(
                SpringQueryLatency(
                    settings, test.workload, prefix=prefix,
                    ddocs=test.ddocs, params=params, index_type=index_type
                )
            )

    def prepare_spatial_latency(self, clusters, test):
        """Add a SpringSpatialQueryLatency collector per cluster."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            prefix = test.target_iterator.prefix or \
                target_hash(settings.master_node.split(":")[0])
            self.collectors.append(
                SpringSpatialQueryLatency(
                    settings, test.workload, prefix=prefix,
                    spatial_settings=test.test_config.spatial_settings
                )
            )

    def prepare_n1ql_latency(self, clusters, test):
        """Add a SpringN1QLQueryLatency collector per cluster."""
        default_queries = test.test_config.access_settings.n1ql_queries
        self.settings.new_n1ql_queries = getattr(test, "n1ql_queries",
                                                 default_queries)
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(
                SpringN1QLQueryLatency(settings, test.workload, prefix="n1ql"))

    def prepare_active_tasks(self, clusters):
        """Add an ActiveTasks collector per cluster (best-effort: skipped on error)."""
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            collector = ActiveTasks(settings)
            try:
                # Probe bucket discovery before committing to the collector.
                sum(1 for _ in collector.get_buckets())
                self.collectors.append(collector)
            except RuntimeError:
                pass

    def update_metadata(self):
        """Push metadata for every prepared collector."""
        for collector in self.collectors:
            collector.update_metadata()

    def start(self):
        """Spawn one collection process per collector."""
        if self.bandwidth:
            self.remote.start_bandwidth_monitor()
        self.processes = [Process(target=c.collect) for c in self.collectors]
        # FIX: map() returns a lazy iterator on Python 3, so
        # map(lambda p: p.start(), ...) never actually started anything.
        for process in self.processes:
            process.start()

    def stop(self):
        """Terminate all collection processes and return the stop timestamp."""
        # FIX: map() is lazy on Python 3; use an explicit loop so the
        # processes are really terminated.
        for process in self.processes:
            process.terminate()
        if self.bandwidth:
            self.remote.stop_bandwidth_monitor()
        return datetime.utcnow()

    def trigger_reports(self, snapshot):
        """Ask cbmonitor to generate reports for the given snapshot."""
        for report_type in ("html", "get_corr_matrix"):
            url = "http://{}/reports/{}/?snapshot={}".format(
                self.settings.cbmonitor_host_port, report_type, snapshot)
            logger.info(url)
            requests.get(url=url)

    def add_snapshot(self, phase, ts_from, ts_to):
        """Register a snapshot per cluster for [ts_from, ts_to] and report it."""
        for i, cluster in enumerate(self.clusters, start=1):
            snapshot = "{}_{}".format(cluster, phase)
            self.settings.cluster = cluster
            md_client = MetadataClient(self.settings)
            md_client.add_snapshot(snapshot, ts_from, ts_to)
            self.snapshots.append(snapshot)
            self.trigger_reports(snapshot)
class KubernetesRestHelper(RestBase):
    """REST helper for Couchbase clusters deployed via the Kubernetes operator.

    Pod-internal host/port pairs are translated to externally reachable
    ones through the IP/port mapping fetched from the remote helper.
    """

    def __init__(self, cluster_spec: ClusterSpec):
        super().__init__(cluster_spec=cluster_spec)
        self.remote = RemoteHelper(cluster_spec)
        # ip_table: internal IP -> external IP;
        # port_translation: external IP -> {internal port -> external port}
        self.ip_table, self.port_translation = self.remote.get_ip_port_mapping()

    def translate_host_and_port(self, host, port):
        """Map an internal host/port pair to an externally reachable one."""
        trans_host = self.ip_table.get(host)
        trans_port = self.port_translation.get(trans_host).get(str(port))
        return trans_host, trans_port

    def exec_n1ql_statement(self, host: str, statement: str) -> dict:
        """Execute a N1QL statement via the query service (port 8093)."""
        host, port = self.translate_host_and_port(host, '8093')
        api = 'http://{}:{}/query/service'\
            .format(host, port)
        data = {
            'statement': statement,
        }
        response = self.post(url=api, data=data)
        return response.json()

    # indexer endpoints not yet exposed by operator
    def get_index_status(self, host: str) -> dict:
        return {'status': [{'status': 'Ready'}]}

    # indexer endpoints not yet exposed by operator
    def get_gsi_stats(self, host: str) -> dict:
        return {'num_docs_queued': 0, 'num_docs_pending': 0}

    def get_active_nodes_by_role(self, master_node: str, role: str) -> List[str]:
        """Return every node configured with the given role.

        Idiom fix: list() over the iterable replaces the manual append loop.
        """
        return list(self.cluster_spec.servers_by_role(role))

    def node_statuses(self, host: str) -> dict:
        """Return {node: status} from the /nodeStatuses endpoint."""
        host, port = self.translate_host_and_port(host, '8091')
        api = 'http://{}:{}/nodeStatuses'\
            .format(host, port)
        data = self.get(url=api).json()
        return {node: info['status'] for node, info in data.items()}

    def get_version(self, host: str) -> str:
        """Return the server version string with edition suffixes stripped."""
        logger.info('Getting Couchbase Server version')
        host, port = self.translate_host_and_port(host, '8091')
        api = 'http://{}:{}/pools/'\
            .format(host, port)
        r = self.get(url=api).json()
        return r['implementationVersion'] \
            .replace('-rel-enterprise', '') \
            .replace('-enterprise', '') \
            .replace('-community', '')

    def get_bucket_stats(self, host: str, bucket: str) -> dict:
        """Return the stats document for the given bucket."""
        host, port = self.translate_host_and_port(host, '8091')
        api = 'http://{}:{}/pools/default/buckets/{}/stats'\
            .format(host, port, bucket)
        return self.get(url=api).json()

    def get_bucket_info(self, host: str, bucket: str) -> List[str]:
        """Return the configuration document for the given bucket."""
        host, port = self.translate_host_and_port(host, '8091')
        api = 'http://{}:{}/pools/default/buckets/{}'\
            .format(host, port, bucket)
        return self.get(url=api).json()
class DefaultMonitor(DefaultRestHelper):
    """Polling helpers that block until a cluster operation completes.

    Wraps the REST stats endpoints with retry/timeout loops for rebalance,
    disk/DCP/XDCR queues, indexing, FTS, Elasticsearch, eventing, and
    analytics monitoring.
    """

    MAX_RETRY = 150
    MAX_RETRY_RECOVERY = 1200
    MAX_RETRY_TIMER_EVENT = 18000
    MAX_RETRY_BOOTSTRAP = 1200

    MONITORING_DELAY = 5

    POLLING_INTERVAL = 2
    POLLING_INTERVAL_INDEXING = 1
    POLLING_INTERVAL_MACHINE_UP = 10
    POLLING_INTERVAL_ANALYTICS = 15
    POLLING_INTERVAL_EVENTING = 1

    REBALANCE_TIMEOUT = 3600 * 6
    TIMEOUT = 3600 * 12

    # Bucket stats that must drain to zero for disk persistence to be done.
    DISK_QUEUES = (
        'ep_queue_size',
        'ep_flusher_todo',
        'ep_diskqueue_items',
        'vb_active_queue_size',
        'vb_replica_queue_size',
    )

    # Bucket stats that must drain to zero for DCP replication to be done.
    DCP_QUEUES = (
        'ep_dcp_replica_items_remaining',
        'ep_dcp_other_items_remaining',
    )

    XDCR_QUEUES = ('replication_changes_left', )

    def __init__(self, cluster_spec, test_config, verbose):
        super().__init__(cluster_spec=cluster_spec)
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.master_node = next(cluster_spec.masters)
        self.build = self.get_version(self.master_node)
        # Build string looks like "X.Y.Z-NNNN"; store it as a sortable tuple
        # (major, minor, patch, build) for version-gated behavior below.
        version, build_number = self.build.split('-')
        self.build_version_number = tuple(map(
            int, version.split('.'))) + (int(build_number), )

    def monitor_rebalance(self, host):
        """Block until rebalance finishes or hangs past REBALANCE_TIMEOUT."""
        logger.info('Monitoring rebalance status')
        is_running = True
        last_progress = 0
        last_progress_time = time.time()
        while is_running:
            time.sleep(self.POLLING_INTERVAL)
            is_running, progress = self.get_task_status(host,
                                                        task_type='rebalance')
            if progress == last_progress:
                # No forward progress: give up once the hang exceeds the
                # timeout rather than polling forever.
                if time.time() - last_progress_time > self.REBALANCE_TIMEOUT:
                    logger.error('Rebalance hung')
                    break
            else:
                last_progress = progress
                last_progress_time = time.time()
            if progress is not None:
                logger.info('Rebalance progress: {} %'.format(progress))
        logger.info('Rebalance completed')

    def _wait_for_empty_queues(self, host, bucket, queues, stats_function):
        """Poll stats_function until every metric in queues reaches 0."""
        metrics = list(queues)
        start_time = time.time()
        while metrics:
            bucket_stats = stats_function(host, bucket)
            # As we are changing metrics in the loop; take a copy of it to
            # iterate over.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[-1]
                    if last_value:
                        logger.info('{} = {:,}'.format(metric, last_value))
                        continue
                    else:
                        logger.info('{} reached 0'.format(metric))
                        metrics.remove(metric)
                else:
                    # Missing metric is treated the same as a drained queue.
                    logger.info('{} reached 0'.format(metric))
                    metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
                if time.time() - start_time > self.TIMEOUT:
                    raise Exception('Monitoring got stuck')

    def _wait_for_empty_dcp_queues(self, host, bucket, stats_function):
        """Poll until the replica DCP items-remaining stat reaches 0."""
        start_time = time.time()
        while True:
            kv_dcp_stats = stats_function(host, bucket)
            stats = int(kv_dcp_stats["data"][0]["values"][-1][1])
            if stats:
                logger.info('{} = {}'.format('ep_dcp_replica_items_remaining',
                                             stats))
                if time.time() - start_time > self.TIMEOUT:
                    raise Exception('Monitoring got stuck')
                time.sleep(self.POLLING_INTERVAL)
            else:
                logger.info(
                    '{} reached 0'.format('ep_dcp_replica_items_remaining'))
                break

    def _wait_for_replica_count_match(self, host, bucket):
        """Poll until replica item count equals active items * replica count."""
        start_time = time.time()
        bucket_info = self.get_bucket_info(host, bucket)
        replica_number = int(bucket_info['replicaNumber'])
        while replica_number:
            bucket_stats = self.get_bucket_stats(host, bucket)
            curr_items = bucket_stats['op']['samples'].get("curr_items")[-1]
            replica_curr_items = bucket_stats['op']['samples'].get(
                "vb_replica_curr_items")[-1]
            logger.info("curr_items: {}, replica_curr_items: {}".format(
                curr_items, replica_curr_items))
            if (curr_items * replica_number) == replica_curr_items:
                break
            time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Replica items monitoring got stuck')

    def _wait_for_replication_completion(self, host, bucket, queues,
                                         stats_function, link1, link2):
        """Drain XDCR queues while tracking when the first link completes.

        Returns (link1_time, link2_items): the wall-clock time at which the
        first link reached 100% completeness, and the docs_written count of
        link2 at that moment.
        """
        metrics = list(queues)
        completion_count = 0
        link1_time = 0
        link2_items = 0
        link1_compelteness_str = \
            'replications/{}/bucket-1/bucket-1/percent_completeness'.format(link1)
        link2_compelteness_str = \
            'replications/{}/bucket-1/bucket-1/percent_completeness'.format(link2)
        link2_items_str = \
            'replications/{}/bucket-1/bucket-1/docs_written'.format(link2)
        start_time = time.time()
        while metrics:
            bucket_stats = stats_function(host, bucket)
            # As we are changing metrics in the loop; take a copy of it to
            # iterate over.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[-1]
                    if last_value:
                        logger.info('{} = {:,}'.format(metric, last_value))
                        link1_completeness = \
                            bucket_stats['op']['samples'].get(link1_compelteness_str)[-1]
                        link2_completeness = \
                            bucket_stats['op']['samples'].get(link2_compelteness_str)[-1]
                        if link1_completeness == 100 or \
                                link2_completeness == 100:
                            if link1_completeness == 100:
                                # Only the first link to finish is recorded.
                                if completion_count == 0:
                                    link1_time = time.time()
                                    link2_items = \
                                        bucket_stats['op']['samples'].get(link2_items_str)[-1]
                                    completion_count = completion_count + 1
                            elif link2_completeness == 100:
                                if completion_count == 0:
                                    link1_time = time.time()
                                    link2_items = \
                                        bucket_stats['op']['samples'].get(link2_items_str)[-1]
                                    completion_count = completion_count + 1
                        continue
                    else:
                        logger.info('{} reached 0'.format(metric))
                        if completion_count == 0:
                            link1_time = time.time()
                            link2_items = \
                                bucket_stats['op']['samples'].get(link2_items_str)[-1]
                            completion_count = completion_count + 1
                        metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
                if time.time() - start_time > self.TIMEOUT:
                    raise Exception('Monitoring got stuck')
        return link1_time, link2_items

    def _wait_for_completeness(self, host, bucket, xdcr_link, stats_function):
        """Poll until the given XDCR link reports 100% completeness."""
        metrics = []
        metrics.append(xdcr_link)
        start_time = time.time()
        while metrics:
            bucket_stats = stats_function(host, bucket)
            # NOTE(review): reads stats[0] (first sample) while the other
            # monitors read stats[-1]; also removes from the list being
            # iterated (safe only because it holds one element) — confirm.
            for metric in metrics:
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[0]
                    if last_value != 100:
                        logger.info('{} : {}'.format(metric, last_value))
                    elif last_value == 100:
                        logger.info('{} Completed 100 %'.format(metric))
                        metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
                if time.time() - start_time > self.TIMEOUT:
                    raise Exception('Monitoring got stuck')

    def monitor_disk_queues(self, host, bucket):
        """Block until all disk write queues are drained."""
        logger.info('Monitoring disk queues: {}'.format(bucket))
        self._wait_for_empty_queues(host, bucket, self.DISK_QUEUES,
                                    self.get_bucket_stats)

    def monitor_dcp_queues(self, host, bucket):
        """Block until DCP queues are drained (strategy depends on build)."""
        logger.info('Monitoring DCP queues: {}'.format(bucket))
        if self.build_version_number < (7, 0, 0, 3937):
            self._wait_for_empty_queues(host, bucket, self.DCP_QUEUES,
                                        self.get_bucket_stats)
        else:
            if self.test_config.bucket.replica_number != 0:
                self._wait_for_empty_dcp_queues(host, bucket,
                                                self.get_dcp_replication_items)
            # Shadows the class-level DCP_QUEUES with an instance attribute,
            # so later calls on this instance skip the replica metric.
            self.DCP_QUEUES = ('ep_dcp_other_items_remaining', )
            self._wait_for_empty_queues(host, bucket, self.DCP_QUEUES,
                                        self.get_bucket_stats)

    def monitor_replica_count(self, host, bucket):
        """Block until replica item counts match active item counts."""
        logger.info('Monitoring replica count match: {}'.format(bucket))
        self._wait_for_replica_count_match(host, bucket)

    def _wait_for_xdcr_to_start(self, host: str):
        """Poll until an XDCR task is reported as running."""
        is_running = False
        while not is_running:
            time.sleep(self.POLLING_INTERVAL)
            is_running, _ = self.get_task_status(host, task_type='xdcr')

    def xdcr_link_starttime(self, host: str, uuid: str):
        """Poll until the XDCR link with the given uuid starts; return that time."""
        is_running = False
        while not is_running:
            time.sleep(self.POLLING_INTERVAL)
            is_running, _ = self.get_xdcrlink_status(host, task_type='xdcr',
                                                    uuid=uuid)
        return time.time()

    def monitor_xdcr_queues(self, host: str, bucket: str):
        """Block until XDCR replication queues are drained."""
        logger.info('Monitoring XDCR queues: {}'.format(bucket))
        self._wait_for_xdcr_to_start(host)
        # adding temporary delay to make sure replication_changes_left stats arrives
        time.sleep(20)
        self._wait_for_empty_queues(host, bucket, self.XDCR_QUEUES,
                                    self.get_xdcr_stats)

    def monitor_xdcr_changes_left(self, host: str, bucket: str,
                                  xdcrlink1: str, xdcrlink2: str):
        """Drain XDCR queues; return (start_time, link1_time, link2_items)."""
        logger.info('Monitoring XDCR queues: {}'.format(bucket))
        self._wait_for_xdcr_to_start(host)
        start_time = time.time()
        link1_time, link2_items = self._wait_for_replication_completion(
            host, bucket, self.XDCR_QUEUES, self.get_xdcr_stats,
            xdcrlink1, xdcrlink2)
        return start_time, link1_time, link2_items

    def monitor_xdcr_completeness(self, host: str, bucket: str,
                                  xdcr_link: str):
        """Block until the XDCR link reaches 100%; return the finish time."""
        logger.info('Monitoring XDCR Link Completeness: {}'.format(bucket))
        self._wait_for_completeness(host=host, bucket=bucket,
                                    xdcr_link=xdcr_link,
                                    stats_function=self.get_xdcr_stats)
        return time.time()

    def get_num_items(self, host: str, bucket: str):
        """Return the total (active + replica) item count for the bucket."""
        num_items = self._get_num_items(host, bucket, total=True)
        return num_items

    def _get_num_items(self, host: str, bucket: str,
                       total: bool = False) -> int:
        """Return the latest item count sample (0 if the stat is absent)."""
        stats = self.get_bucket_stats(host=host, bucket=bucket)
        if total:
            curr_items = stats['op']['samples'].get('curr_items_tot')
        else:
            curr_items = stats['op']['samples'].get('curr_items')
        if curr_items:
            return curr_items[-1]
        return 0

    def monitor_num_items(self, host: str, bucket: str, num_items: int):
        """Poll until the bucket holds exactly num_items; raise on timeout."""
        logger.info('Checking the number of items in {}'.format(bucket))
        retries = 0
        while retries < self.MAX_RETRY:
            curr_items = self._get_num_items(host, bucket, total=True)
            if curr_items == num_items:
                break
            else:
                logger.info('{}(curr_items) != {}(num_items)'.format(
                    curr_items, num_items))
                time.sleep(self.POLLING_INTERVAL)
                retries += 1
        else:
            # while/else: only reached when retries are exhausted.
            actual_items = self._get_num_items(host, bucket, total=True)
            raise Exception(
                'Mismatch in the number of items: {}'.format(actual_items))

    def monitor_num_backfill_items(self, host: str, bucket: str,
                                   num_items: int):
        """Poll until the bucket holds num_items; return the elapsed seconds."""
        logger.info('Checking the number of items in {}'.format(bucket))
        t0 = time.time()
        while True:
            curr_items = self._get_num_items(host, bucket, total=True)
            if curr_items == num_items:
                t1 = time.time()
                break
            else:
                logger.info('{}(curr_items) != {}(num_items)'.format(
                    curr_items, num_items))
                time.sleep(self.POLLING_INTERVAL)
        return t1 - t0

    def monitor_task(self, host, task_type):
        """Poll until no tasks of the given type remain running."""
        logger.info('Monitoring task: {}'.format(task_type))
        time.sleep(self.MONITORING_DELAY * 2)
        while True:
            time.sleep(self.POLLING_INTERVAL)
            tasks = [
                task for task in self.get_tasks(host)
                if task.get('type') == task_type
            ]
            if tasks:
                for task in tasks:
                    logger.info('{}: {}%, bucket: {}, ddoc: {}'.format(
                        task_type,
                        task.get('progress'),
                        task.get('bucket'), task.get('designDocument')))
            else:
                break
        logger.info('Task {} successfully completed'.format(task_type))

    def monitor_warmup(self, memcached, host, bucket):
        """Poll warmup stats until done; return the warmup time in seconds."""
        logger.info('Monitoring warmup status: {}@{}'.format(bucket, host))
        memcached_port = self.get_memcached_port(host)
        while True:
            stats = memcached.get_stats(host, memcached_port, bucket, 'warmup')
            if 'ep_warmup_state' in stats:
                state = stats['ep_warmup_state']
                if state == 'done':
                    return float(stats.get('ep_warmup_time', 0))
                else:
                    logger.info('Warmpup status: {}'.format(state))
                    time.sleep(self.POLLING_INTERVAL)
            else:
                logger.info('No warmup stats are available, continue polling')
                time.sleep(self.POLLING_INTERVAL)

    def monitor_compression(self, memcached, host, bucket):
        """Poll until no active documents remain in uncompressed JSON form."""
        logger.info('Monitoring active compression status')
        memcached_port = self.get_memcached_port(host)
        json_docs = -1
        while json_docs:
            stats = memcached.get_stats(host, memcached_port, bucket)
            json_docs = int(stats['ep_active_datatype_json'])
            if json_docs:
                logger.info('Still uncompressed: {:,} items'.format(json_docs))
                time.sleep(self.POLLING_INTERVAL)
        logger.info('All items are compressed')

    def monitor_node_health(self, host):
        """Poll until both status endpoints report every node healthy."""
        logger.info('Monitoring node health')
        for retry in range(self.MAX_RETRY):
            unhealthy_nodes = {
                n for n, status in self.node_statuses(host).items()
                if status != 'healthy'
            } | {
                n for n, status in self.node_statuses_v2(host).items()
                if status != 'healthy'
            }
            if unhealthy_nodes:
                time.sleep(self.POLLING_INTERVAL)
            else:
                break
        else:
            # for/else: only reached when retries are exhausted.
            logger.interrupt(
                'Some nodes are not healthy: {}'.format(unhealthy_nodes))

    def monitor_analytics_node_active(self, host):
        """Poll until the analytics node reports itself active."""
        logger.info('Monitoring analytics node health')
        for retry in range(self.MAX_RETRY):
            active = self.analytics_node_active(host)
            if active:
                break
            else:
                time.sleep(self.POLLING_INTERVAL)
        else:
            logger.interrupt(
                'Analytics node still not healthy: {}'.format(host))

    def is_index_ready(self, host: str) -> bool:
        """Return True when every index reports status 'Ready'."""
        for status in self.get_index_status(host)['status']:
            if status['status'] != 'Ready':
                return False
        return True

    def estimate_pending_docs(self, host: str) -> int:
        """Sum queued and pending document counts across all GSI stats."""
        stats = self.get_gsi_stats(host)
        pending_docs = 0
        for metric, value in stats.items():
            if 'num_docs_queued' in metric or 'num_docs_pending' in metric:
                pending_docs += value
        return pending_docs

    def monitor_indexing(self, host):
        """Poll until all indexes are ready, logging the pending backlog."""
        logger.info('Monitoring indexing progress')
        while not self.is_index_ready(host):
            time.sleep(self.POLLING_INTERVAL_INDEXING * 5)
            pending_docs = self.estimate_pending_docs(host)
            logger.info('Pending docs: {:,}'.format(pending_docs))
        logger.info('Indexing completed')

    def wait_for_secindex_init_build(self, host, indexes):
        """Poll until the initial build of the given secondary indexes is done.

        Returns the elapsed build time in whole seconds.
        """
        # POLL until initial index build is complete
        logger.info("Waiting for the following indexes to be ready: {}".format(
            indexes))

        # One slot per index; flipped to 1 when that index reports 'Ready'.
        indexes_ready = [0 for _ in indexes]

        def get_index_status(json2i, index):
            """Return the index status."""
            for d in json2i["status"]:
                if d["name"] == index:
                    return d["status"]
            return None

        @misc.retry(catch=(KeyError, ), iterations=10, wait=30)
        def update_indexes_ready():
            json2i = self.get_index_status(host)
            for i, index in enumerate(indexes):
                status = get_index_status(json2i, index)
                if status == 'Ready':
                    indexes_ready[i] = 1

        init_ts = time.time()
        while sum(indexes_ready) != len(indexes):
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            update_indexes_ready()
        finish_ts = time.time()
        logger.info('secondary index build time: {}'.format(
            finish_ts - init_ts))
        time_elapsed = round(finish_ts - init_ts)
        return time_elapsed

    def wait_for_secindex_init_build_collections(self, host, indexes):
        """Poll until all collection-level secondary indexes are built."""
        # POLL until initial index build is complete
        # Flatten the nested bucket/scope/collection map to plain index names.
        index_list = []
        for bucket_name, scope_map in indexes.items():
            for scope_name, collection_map in scope_map.items():
                for collection_name, index_map in collection_map.items():
                    for index_name in index_map.keys():
                        index_list.append(index_name)
        indexes = index_list
        logger.info("Waiting for the following indexes to be ready: {}".format(
            indexes))

        indexes_ready = [0 for _ in indexes]

        def get_index_status(json2i, index):
            """Return the index status."""
            for d in json2i["status"]:
                if d["name"] == index:
                    return d["status"]
            return None

        @misc.retry(catch=(KeyError, ), iterations=10, wait=30)
        def update_indexes_ready():
            json2i = self.get_index_status(host)
            for i, index in enumerate(indexes):
                status = get_index_status(json2i, index)
                if status == 'Ready':
                    indexes_ready[i] = 1

        while sum(indexes_ready) != len(indexes):
            time.sleep(self.POLLING_INTERVAL_INDEXING * 10)
            update_indexes_ready()
        logger.info('secondary index build complete: {}'.format(indexes))

    def wait_for_secindex_incr_build(self, index_nodes, bucket, indexes,
                                     numitems):
        """Poll until the incremental build has no pending/queued documents."""
        # POLL until incremenal index build is complete
        logger.info('expecting {} num_docs_indexed for indexes {}'.format(
            numitems, indexes))

        # collect num_docs_indexed information globally from all index nodes
        def get_num_docs_indexed():
            data = self.get_index_stats(index_nodes)
            num_indexed = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_indexed"
                val = data[key]
                num_indexed.append(val)
            return num_indexed

        def get_num_docs_index_pending():
            data = self.get_index_stats(index_nodes)
            num_pending = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_pending"
                val1 = data[key]
                key = "" + bucket + ":" + index + ":num_docs_queued"
                val2 = data[key]
                val = int(val1) + int(val2)
                num_pending.append(val)
            return num_pending

        expected_num_pending = [0] * len(indexes)
        while True:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_num_pending = get_num_docs_index_pending()
            if curr_num_pending == expected_num_pending:
                break
        curr_num_indexed = get_num_docs_indexed()
        logger.info("Number of Items indexed {}".format(curr_num_indexed))

    def wait_for_secindex_incr_build_collections(self, index_nodes, index_map,
                                                 expected_num_docs):
        """Poll until incremental builds of collection indexes are drained."""
        # Build fully-qualified index names; default scope/collection uses
        # the short bucket:index form.
        indexes = []
        for bucket_name, scope_map in index_map.items():
            for scope_name, collection_map in scope_map.items():
                for collection_name, coll_index_map in collection_map.items():
                    for index_name, index_def in coll_index_map.items():
                        if scope_name == '_default' \
                                and collection_name == '_default':
                            target_index = "{}:{}".format(
                                bucket_name, index_name)
                        else:
                            target_index = "{}:{}:{}:{}".format(
                                bucket_name, scope_name,
                                collection_name, index_name)
                        indexes.append(target_index)
        logger.info('expecting {} num_docs_indexed for indexes {}'.format(
            expected_num_docs, indexes))

        # collect num_docs_indexed information globally from all index nodes
        def get_num_docs_indexed():
            data = self.get_index_stats(index_nodes)
            num_indexed = []
            for index in indexes:
                key = index + ":num_docs_indexed"
                val = data[key]
                num_indexed.append(val)
            return num_indexed

        def get_num_docs_index_pending():
            data = self.get_index_stats(index_nodes)
            num_pending = []
            for index in indexes:
                key = index + ":num_docs_pending"
                val1 = data[key]
                key = index + ":num_docs_queued"
                val2 = data[key]
                val = int(val1) + int(val2)
                num_pending.append(val)
            return num_pending

        expected_num_pending = [0] * len(indexes)
        while True:
            time.sleep(self.POLLING_INTERVAL_INDEXING * 10)
            curr_num_pending = get_num_docs_index_pending()
            if curr_num_pending == expected_num_pending:
                break
        curr_num_indexed = get_num_docs_indexed()
        logger.info("Number of Items indexed {}".format(curr_num_indexed))

    def wait_for_num_connections(self, index_node, expected_connections):
        """Poll until the indexer reports the expected connection count.

        Returns False if MAX_RETRY is reached first, True otherwise.
        """
        curr_connections = self.get_index_num_connections(index_node)
        retry = 1
        while curr_connections < expected_connections and \
                retry < self.MAX_RETRY:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_connections = self.get_index_num_connections(index_node)
            logger.info("Got current connections {}".format(curr_connections))
            retry += 1
        if retry == self.MAX_RETRY:
            return False
        return True

    def wait_for_recovery(self, index_nodes, bucket, index):
        """Poll for the index disk_load_duration stat; -1 if never seen."""
        time.sleep(self.MONITORING_DELAY)
        for retry in range(self.MAX_RETRY_RECOVERY):
            response = self.get_index_stats(index_nodes)
            item = "{}:{}:disk_load_duration".format(bucket, index)
            if item in response:
                return response[item]
            else:
                time.sleep(self.POLLING_INTERVAL)
        return -1

    def wait_for_servers(self):
        """Poll until every server in the spec responds as up."""
        for retry in range(self.MAX_RETRY):
            logger.info('Waiting for all servers to be available')
            time.sleep(self.POLLING_INTERVAL_MACHINE_UP)

            for server in self.cluster_spec.servers:
                if not self.remote.is_up(server):
                    break
            else:
                # for/else: no break means every server responded.
                logger.info('All nodes are up')
                return

        logger.interrupt('Some nodes are still down')

    def monitor_fts_indexing_queue(self, host: str, index: str, items: int):
        """Poll until the FTS index holds at least the expected doc count."""
        logger.info('Waiting for indexing to finish')
        count = 0
        while count < items:
            count = self.get_fts_doc_count(host, index)
            logger.info('FTS indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_fts_index_persistence(self, hosts: list, index: str,
                                      bkt: str = None):
        """Poll until no FTS records remain to persist or compact."""
        logger.info('Waiting for index to be persisted')
        if not bkt:
            bkt = self.test_config.buckets[0]
        tries = 0
        pending_items = 1
        while pending_items:
            try:
                persist = 0
                compact = 0
                for host in hosts:
                    stats = self.get_fts_stats(host)

                    metric = '{}:{}:{}'.format(bkt, index,
                                               'num_recs_to_persist')
                    persist += stats[metric]

                    metric = '{}:{}:{}'.format(bkt, index,
                                               'total_compactions')
                    compact += stats[metric]

                pending_items = persist or compact
                logger.info('Records to persist: {:,}'.format(persist))
                logger.info('Ongoing compactions: {:,}'.format(compact))
            except KeyError:
                # Stats may not be published yet; retry a bounded number
                # of times before giving up.
                tries += 1
            if tries >= 10:
                raise Exception("cannot get fts stats")
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_indexing_queue(self, host: str, index: str):
        """Poll until Elasticsearch has indexed the configured doc count."""
        logger.info(' Waiting for indexing to finish')
        items = int(self.test_config.fts_settings.test_total_docs)
        count = 0
        while count < items:
            count = self.get_elastic_doc_count(host, index)
            logger.info('Elasticsearch indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_index_persistence(self, host: str, index: str):
        """Poll until the Elasticsearch translog has no pending operations."""
        logger.info('Waiting for index to be persisted')

        pending_items = -1
        while pending_items:
            stats = self.get_elastic_stats(host)
            pending_items = stats['indices'][index]['total']['translog'][
                'operations']
            logger.info('Records to persist: {:,}'.format(pending_items))
            time.sleep(self.POLLING_INTERVAL)

    def wait_for_bootstrap(self, nodes: list, function: str):
        """Poll each node until the eventing function reports as deployed."""
        logger.info(
            'Waiting for bootstrap of eventing function: {} '.format(function))
        for node in nodes:
            retry = 1
            while retry < self.MAX_RETRY_BOOTSTRAP:
                if function in self.get_apps_with_status(node, "deployed"):
                    break
                time.sleep(self.POLLING_INTERVAL)
                retry += 1
            if retry == self.MAX_RETRY_BOOTSTRAP:
                logger.info(
                    'Failed to bootstrap function: {}, node: {}'.format(
                        function, node))

    def get_num_analytics_items(self, analytics_node: str,
                                bucket: str) -> int:
        """Sum incoming analytics record counts across all cbas nodes."""
        stats_key = '{}:all:incoming_records_count_total'.format(bucket)
        num_items = 0
        for node in self.get_active_nodes_by_role(analytics_node, 'cbas'):
            stats = self.get_analytics_stats(node)
            num_items += stats.get(stats_key, 0)
        return num_items

    def get_num_remaining_mutations(self, analytics_node: str) -> int:
        """Return the pending mutation backlog (API varies by build)."""
        while True:
            num_items = 0
            try:
                if self.build_version_number < (7, 0, 0, 4622):
                    stats = self.get_pending_mutations(analytics_node)
                    for dataset in stats['Default']:
                        if self.build_version_number < (7, 0, 0, 4310):
                            num_items += int(stats['Default'][dataset])
                        else:
                            num_items += int(
                                stats['Default'][dataset]['seqnoLag'])
                else:
                    stats = self.get_pending_mutations_v2(analytics_node)
                    for scope in stats['scopes']:
                        for collection in scope['collections']:
                            num_items += int(collection['seqnoLag'])
                break
            except Exception:
                # Stats endpoint may be temporarily unavailable; retry.
                time.sleep(self.POLLING_INTERVAL_ANALYTICS)
        return num_items

    def monitor_data_synced(self, data_node: str, bucket: str,
                            analytics_node: str) -> int:
        """Block until analytics has ingested the bucket's data."""
        logger.info('Waiting for data to be synced from {}'.format(data_node))
        time.sleep(self.MONITORING_DELAY * 3)
        num_items = self._get_num_items(data_node, bucket)
        while True:
            if self.build_version_number < (7, 0, 0, 0):
                num_analytics_items = self.get_num_analytics_items(
                    analytics_node,
bucket) else: incoming_records = self.get_cbas_incoming_records_count( analytics_node) num_analytics_items = int( incoming_records["data"][0]["values"][-1][1]) logger.info('Analytics has {:,} docs (target is {:,})'.format( num_analytics_items, num_items)) if self.build_version_number < (6, 5, 0, 0): if num_analytics_items == num_items: break else: num_remaining_mutations = self.get_num_remaining_mutations( analytics_node) logger.info('Number of remaining mutations: {}'.format( num_remaining_mutations)) if num_remaining_mutations == 0: break time.sleep(self.POLLING_INTERVAL_ANALYTICS) return num_items def monitor_dataset_drop(self, analytics_node: str, dataset: str): while True: statement = "SELECT COUNT(*) from `{}`;".format(dataset) result = self.exec_analytics_query(analytics_node, statement) num_analytics_items = result['results'][0]['$1'] logger.info("Number of items in dataset {}: {}".format( dataset, num_analytics_items)) if num_analytics_items == 0: break time.sleep(self.POLLING_INTERVAL) def wait_for_timer_event(self, node: str, function: str, event="timer_events"): logger.info('Waiting for timer events to start processing: {} '.format( function)) retry = 1 while retry < self.MAX_RETRY_TIMER_EVENT: if 0 < self.get_num_events_processed( event=event, node=node, name=function): break time.sleep(self.POLLING_INTERVAL_EVENTING) retry += 1 if retry == self.MAX_RETRY_TIMER_EVENT: logger.info( 'Failed to get timer event for function: {}'.format(function)) def wait_for_all_mutations_processed(self, host: str, bucket1: str, bucket2: str): logger.info( 'Waiting for mutations to be processed of eventing function') retry = 1 while retry < self.MAX_RETRY_BOOTSTRAP: if self._get_num_items(host=host, bucket=bucket1) == \ self._get_num_items(host=host, bucket=bucket2): break retry += 1 time.sleep(self.POLLING_INTERVAL_EVENTING) if retry == self.MAX_RETRY_BOOTSTRAP: logger.info('Failed to process all mutations... 
TIMEOUT') def wait_for_all_timer_creation(self, node: str, function: str): logger.info( 'Waiting for all timers to be created by : {} '.format(function)) retry = 1 events_processed = {} while retry < self.MAX_RETRY_TIMER_EVENT: events_processed = self.get_num_events_processed(event="ALL", node=node, name=function) if events_processed["dcp_mutation"] == events_processed[ "timer_responses_received"]: break time.sleep(self.POLLING_INTERVAL_EVENTING) retry += 1 if retry == self.MAX_RETRY_TIMER_EVENT: logger.info('Got only {} timers created for function: {}'.format( events_processed["timer_responses_received"], function)) def wait_for_function_status(self, node: str, function: str, status: str): logger.info('Waiting for {} function to {}'.format(function, status)) retry = 1 while retry < self.MAX_RETRY_TIMER_EVENT: op = self.get_apps_with_status(node, status) if function in op: break time.sleep(self.POLLING_INTERVAL_EVENTING) retry += 1 if retry == self.MAX_RETRY_TIMER_EVENT: logger.info('Function {} failed to {}...!!!'.format( function, status))
class PerfTest:

    """Base class wiring together the helpers needed by a performance test.

    An instance owns the REST/remote/memcached helpers, the monitoring and
    reporting machinery, and (optionally) a worker manager for generating
    load.  Subclasses implement concrete workloads and override
    ``_report_kpi`` to publish results.
    """

    # cbmonitor collectors enabled for this test; overridden by subclasses.
    COLLECTORS = {}

    # File name used by download_certificate() to store the cluster root cert.
    ROOT_CERTIFICATE = 'root.pem'

    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.target_iterator = TargetIterator(cluster_spec, test_config)
        self.cluster = ClusterManager(cluster_spec, test_config)
        self.memcached = MemcachedHelper(test_config)
        self.monitor = Monitor(cluster_spec, test_config, verbose)
        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.profiler = Profiler(cluster_spec, test_config)

        self.master_node = next(cluster_spec.masters)
        self.build = self.rest.get_version(self.master_node)

        self.metrics = MetricHelper(self)
        self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)

        self.cbmonitor_snapshots = []
        self.cbmonitor_clusters = []

        if self.test_config.test_case.use_workers:
            self.worker_manager = WorkerManager(cluster_spec, test_config,
                                                verbose)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Capture failures before tear-down so evidence is not destroyed.
        failure = self.debug()

        self.tear_down()

        if exc_type == KeyboardInterrupt:
            logger.warn('The test was interrupted')
            return True

        if failure:
            logger.interrupt(failure)

    @property
    def query_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node, 'n1ql')

    @property
    def index_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node, 'index')

    @property
    def fts_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node, 'fts')

    @property
    def analytics_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node, 'cbas')

    @property
    def eventing_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node,
                                                  'eventing')

    def tear_down(self):
        """Stop workers and undo host-level tweaks made for this test."""
        if self.test_config.test_case.use_workers:
            self.worker_manager.download_celery_logs()
            self.worker_manager.terminate()

        if self.test_config.cluster.online_cores:
            self.remote.enable_cpu()

        if self.test_config.cluster.kernel_mem_limit:
            self.collect_logs()
            self.cluster.reset_memory_settings()

    def collect_logs(self):
        """Trigger log collection and move the archives to the CWD."""
        self.remote.collect_info()

        for hostname in self.cluster_spec.servers:
            for fname in glob.glob('{}/*.zip'.format(hostname)):
                shutil.move(fname, '{}.zip'.format(hostname))

    def reset_memory_settings(self):
        """Restore kernel memory settings on all hosts that were limited."""
        if self.test_config.cluster.kernel_mem_limit:
            for service in self.test_config.cluster.kernel_mem_limit_services:
                for server in self.cluster_spec.servers_by_role(service):
                    self.remote.reset_memory_settings(host_string=server)
            self.monitor.wait_for_servers()

    def debug(self) -> Optional[str]:
        """Return a failure description, or None when the cluster looks sane.

        Failover problems take precedence over rebalance problems, which
        take precedence over core dumps.
        """
        failure = self.check_core_dumps()
        failure = self.check_rebalance() or failure
        return self.check_failover() or failure

    def download_certificate(self):
        """Save the cluster root certificate to ROOT_CERTIFICATE."""
        cert = self.rest.get_certificate(self.master_node)
        with open(self.ROOT_CERTIFICATE, 'w') as fh:
            fh.write(cert)

    def check_rebalance(self) -> Optional[str]:
        """Return an error message if any cluster is not balanced."""
        for master in self.cluster_spec.masters:
            if self.rest.is_not_balanced(master):
                return 'The cluster is not balanced'

    def check_failover(self) -> Optional[str]:
        """Return an error message if an unexpected failover happened."""
        # Tests that deliberately fail nodes over are exempt from the check.
        if hasattr(self, 'rebalance_settings'):
            if self.rebalance_settings.failover or \
                    self.rebalance_settings.graceful_failover:
                return

        for master in self.cluster_spec.masters:
            num_failovers = self.rest.get_failover_counter(master)
            if num_failovers:
                return 'Failover happened {} time(s)'.format(num_failovers)

    def check_core_dumps(self) -> Optional[str]:
        """Return a pretty-printed summary of core dumps, if any were found."""
        dumps_per_host = self.remote.detect_core_dumps()
        core_dumps = {
            host: dumps for host, dumps in dumps_per_host.items() if dumps
        }
        if core_dumps:
            return pretty_dict(core_dumps)

    def restore(self):
        """Restore a backup repo on the remote hosts."""
        logger.info('Restoring data')
        self.remote.restore_data(
            self.test_config.restore_settings.backup_storage,
            self.test_config.restore_settings.backup_repo,
        )

    def restore_local(self):
        """Restore a backup repo using a locally extracted cbbackupmgr."""
        logger.info('Restoring data')
        local.extract_cb(filename='couchbase.rpm')
        local.cbbackupmgr_restore(
            master_node=self.master_node,
            cluster_spec=self.cluster_spec,
            threads=self.test_config.restore_settings.threads,
            archive=self.test_config.restore_settings.backup_storage,
            repo=self.test_config.restore_settings.backup_repo,
        )

    def import_data(self):
        """Import the configured data file into every test bucket."""
        logger.info('Importing data')
        for bucket in self.test_config.buckets:
            self.remote.import_data(
                self.test_config.restore_settings.import_file,
                bucket,
            )

    def compact_bucket(self, wait: bool = True):
        """Trigger bucket compaction, optionally blocking until done."""
        for target in self.target_iterator:
            self.rest.trigger_bucket_compaction(target.node, target.bucket)

        if wait:
            for target in self.target_iterator:
                self.monitor.monitor_task(target.node, 'bucket_compaction')

    def wait_for_persistence(self):
        """Block until disk/DCP queues drain and replicas converge."""
        for target in self.target_iterator:
            self.monitor.monitor_disk_queues(target.node, target.bucket)
            self.monitor.monitor_dcp_queues(target.node, target.bucket)
            self.monitor.monitor_replica_count(target.node, target.bucket)

    def wait_for_indexing(self):
        """Block until all configured indexes are built."""
        if self.test_config.index_settings.statements:
            for server in self.index_nodes:
                self.monitor.monitor_indexing(server)

    def check_num_items(self):
        """Verify that the total item count matches active + replica copies."""
        num_items = self.test_config.load_settings.items * (
            1 + self.test_config.bucket.replica_number
        )
        for target in self.target_iterator:
            self.monitor.monitor_num_items(target.node, target.bucket,
                                           num_items)

    def reset_kv_stats(self):
        """Reset memcached counters on every KV node and bucket."""
        master_node = next(self.cluster_spec.masters)
        for bucket in self.test_config.buckets:
            for server in self.rest.get_server_list(master_node, bucket):
                port = self.rest.get_memcached_port(server)
                self.memcached.reset_stats(server, port, bucket)

    def create_indexes(self):
        """Create either N1QL indexes or an FTS index, per test config."""
        logger.info('Creating and building indexes')

        if not self.test_config.index_settings.couchbase_fts_index_name:
            for statement in self.test_config.index_settings.statements:
                self.rest.exec_n1ql_statement(self.query_nodes[0], statement)
        else:
            self.create_fts_index_n1ql()

    def create_fts_index_n1ql(self):
        """Create an FTS index from a JSON definition and wait for indexing."""
        definition = read_json(
            self.test_config.index_settings.couchbase_fts_index_configfile)
        definition.update(
            {'name': self.test_config.index_settings.couchbase_fts_index_name})
        logger.info('Index definition: {}'.format(pretty_dict(definition)))
        self.rest.create_fts_index(
            self.fts_nodes[0],
            self.test_config.index_settings.couchbase_fts_index_name,
            definition)
        self.monitor.monitor_fts_indexing_queue(
            self.fts_nodes[0],
            self.test_config.index_settings.couchbase_fts_index_name,
            int(self.test_config.access_settings.items))

    def create_functions(self):
        """Create the configured N1QL functions."""
        logger.info('Creating n1ql functions')
        for statement in self.test_config.n1ql_function_settings.statements:
            self.rest.exec_n1ql_statement(self.query_nodes[0], statement)

    def sleep(self):
        """Idle for the duration of the access phase."""
        access_settings = self.test_config.access_settings
        logger.info('Running phase for {} seconds'.format(
            access_settings.time))
        time.sleep(access_settings.time)

    def run_phase(self,
                  phase: str,
                  task: Callable,
                  settings: PhaseSettings,
                  target_iterator: Iterable,
                  timer: int = None,
                  wait: bool = True):
        """Dispatch a workload phase to the workers, optionally blocking."""
        logger.info('Running {}: {}'.format(phase, pretty_dict(settings)))
        self.worker_manager.run_tasks(task, settings, target_iterator, timer)
        if wait:
            self.worker_manager.wait_for_workers()

    def load(self,
             task: Callable = spring_task,
             settings: PhaseSettings = None,
             target_iterator: Iterable = None):
        """Run the initial data load phase."""
        if settings is None:
            settings = self.test_config.load_settings
        if target_iterator is None:
            target_iterator = self.target_iterator
        self.run_phase('load phase', task, settings, target_iterator)

    def hot_load(self, task: Callable = spring_task):
        """Run the working-set warm-up phase."""
        settings = self.test_config.hot_load_settings
        self.run_phase('hot load phase', task, settings, self.target_iterator)

    def xattr_load(self,
                   task: Callable = spring_task,
                   target_iterator: Iterable = None):
        """Run the extended-attribute load phase."""
        if target_iterator is None:
            target_iterator = self.target_iterator
        settings = self.test_config.xattr_load_settings
        self.run_phase('xattr phase', task, settings, target_iterator)

    def access(self,
               task: Callable = spring_task,
               settings: PhaseSettings = None,
               target_iterator: Iterable = None):
        """Run the timed foreground access phase (blocking)."""
        if settings is None:
            settings = self.test_config.access_settings
        if target_iterator is None:
            target_iterator = self.target_iterator
        self.run_phase('access phase', task, settings, target_iterator,
                       timer=settings.time)

    def access_bg(self,
                  task: Callable = spring_task,
                  settings: PhaseSettings = None,
                  target_iterator: Iterable = None):
        """Run the access phase in the background (non-blocking)."""
        if settings is None:
            settings = self.test_config.access_settings
        if target_iterator is None:
            target_iterator = self.target_iterator
        self.run_phase('background access phase', task, settings,
                       target_iterator, timer=settings.time, wait=False)

    def report_kpi(self, *args, **kwargs):
        """Publish KPIs when stats reporting is enabled."""
        if self.test_config.stats_settings.enabled:
            self._report_kpi(*args, **kwargs)

    def _report_kpi(self, *args, **kwargs):
        # Overridden by subclasses to post concrete metrics.
        pass

    def _measure_curr_ops(self) -> int:
        """Sum cmd_get + cmd_set across all KV nodes and buckets."""
        ops = 0
        for bucket in self.test_config.buckets:
            for server in self.rest.get_active_nodes_by_role(
                    self.master_node, "kv"):
                port = self.rest.get_memcached_port(server)
                stats = self.memcached.get_stats(server, port, bucket)
                for stat in b'cmd_get', b'cmd_set':
                    ops += int(stats[stat])
        return ops
class ClusterManager(object):

    """Configure and assemble Couchbase clusters before a test run."""

    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
        self.monitor = Monitor(cluster_spec)
        self.memcached = MemcachedHelper(test_config)

        # Generator factories; called afresh on every use.
        self.clusters = cluster_spec.yield_clusters
        self.servers = cluster_spec.yield_servers
        self.masters = cluster_spec.yield_masters

        self.initial_nodes = test_config.cluster.initial_nodes
        self.mem_quota = test_config.cluster.mem_quota
        self.index_mem_quota = test_config.cluster.index_mem_quota
        self.fts_index_mem_quota = test_config.cluster.fts_index_mem_quota
        self.group_number = test_config.cluster.group_number or 1
        self.roles = cluster_spec.roles

    def set_data_path(self):
        """Point every node at the configured data/index paths, if any."""
        if not self.cluster_spec.paths:
            return
        data_path, index_path = self.cluster_spec.paths
        for node in self.servers():
            self.rest.set_data_path(node, data_path, index_path)

    def set_auth(self):
        """Initialize REST credentials on every node."""
        for node in self.servers():
            self.rest.set_auth(node)

    def set_mem_quota(self):
        """Apply the KV memory quota to every node."""
        for node in self.servers():
            self.rest.set_mem_quota(node, self.mem_quota)

    def set_index_mem_quota(self):
        """Apply the index service memory quota to every node."""
        for node in self.servers():
            self.rest.set_index_mem_quota(node, self.index_mem_quota)

    def set_fts_index_mem_quota(self):
        """Apply the FTS memory quota to every node."""
        for node in self.servers():
            self.rest.set_fts_index_mem_quota(node, self.fts_index_mem_quota)

    def set_query_settings(self):
        """Push N1QL settings to every query node."""
        query_settings = self.test_config.n1ql_settings.settings
        for _, nodes in self.cluster_spec.yield_servers_by_role('n1ql'):
            for node in nodes:
                self.rest.set_query_settings(node, query_settings)

    def set_index_settings(self):
        """Push indexer settings to every index node (skipped for memdb)."""
        if self.test_config.secondaryindex_settings.db == 'memdb':
            logger.info("DB type is memdb. Not setting the indexer settings to take the default settings")
            return
        indexer_settings = self.test_config.secondaryindex_settings.settings
        for _, nodes in self.cluster_spec.yield_servers_by_role('index'):
            for node in nodes:
                self.rest.set_index_settings(node, indexer_settings)
        self.remote.restart()
        time.sleep(60)

    def set_services(self):
        """Assign configured service roles to each cluster's master node."""
        for (_, nodes), initial_nodes in zip(self.clusters(),
                                             self.initial_nodes):
            master = nodes[0]
            self.rest.set_services(master, self.roles[master])

    def disable_moxi(self):
        """Turn off moxi when the test configuration requests it."""
        if self.test_config.cluster.disable_moxi is not None:
            self.remote.disable_moxi()

    def create_server_groups(self):
        """Create additional server groups ('Group 2' .. 'Group N')."""
        for master in self.masters():
            for group_index in range(2, self.group_number + 1):
                group_name = 'Group {}'.format(group_index)
                self.rest.create_server_group(master, name=group_name)

    def add_nodes(self):
        """Join the initial nodes to each cluster and rebalance."""
        for (_, nodes), initial_nodes in zip(self.clusters(),
                                             self.initial_nodes):
            if initial_nodes < 2:  # Single-node cluster
                continue

            master = nodes[0]
            groups = {}
            if self.group_number > 1:
                groups = self.rest.get_server_groups(master)

            for idx, host_port in enumerate(nodes[1:initial_nodes], start=1):
                group_key = server_group(nodes[:initial_nodes],
                                         self.group_number, idx)
                uri = groups.get(group_key)
                self.rest.add_node(master, host_port, self.roles[host_port],
                                   uri)

            # Rebalance the freshly assembled cluster
            known_nodes = nodes[:initial_nodes]
            self.rest.rebalance(nodes[0], known_nodes, ejected_nodes=[])
            self.monitor.monitor_rebalance(nodes[0])

    def create_buckets(self, empty_buckets=False):
        """Create the test (or empty) buckets with an even RAM split."""
        total_buckets = self.test_config.cluster.num_buckets + \
            self.test_config.cluster.emptybuckets
        ram_quota = self.mem_quota / total_buckets

        bucket_cfg = self.test_config.bucket
        bucket_names = self.test_config.emptybuckets if empty_buckets \
            else self.test_config.buckets

        for master in self.masters():
            for bucket_name in bucket_names:
                self.rest.create_bucket(
                    host_port=master,
                    name=bucket_name,
                    ram_quota=ram_quota,
                    replica_number=bucket_cfg.replica_number,
                    replica_index=bucket_cfg.replica_index,
                    eviction_policy=bucket_cfg.eviction_policy,
                    threads_number=bucket_cfg.threads_number,
                    password=bucket_cfg.password,
                    proxy_port=bucket_cfg.proxy_port,
                )

    def configure_auto_compaction(self):
        """Apply auto-compaction settings to every cluster."""
        compaction_settings = self.test_config.compaction
        for master in self.masters():
            self.rest.configure_auto_compaction(master, compaction_settings)

    def configure_internal_settings(self):
        """Apply internal (diag) settings to every cluster."""
        for master in self.masters():
            for parameter, value in self.test_config.internal_settings.items():
                self.rest.set_internal_settings(master,
                                                {parameter: int(value)})

    def configure_xdcr_settings(self):
        """Apply cluster-wide XDCR settings to every cluster."""
        for master in self.masters():
            for parameter, value in \
                    self.test_config.xdcr_cluster_settings.items():
                self.rest.set_xdcr_cluster_settings(master,
                                                    {parameter: int(value)})

    def tweak_memory(self):
        """Normalize OS memory behaviour on all hosts."""
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def restart_with_alternative_num_vbuckets(self):
        """Restart nodes with a non-default vBucket count, if configured."""
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        """Inject non-default ep-engine options via diag/eval, then restart."""
        cmd = 'ns_bucket:update_bucket_props("{}", ' \
              '[{{extra_config_string, "{}={}"}}]).'

        options = ('defragmenter_enabled', 'exp_pager_stime', 'ht_locks',
                   'max_num_shards', 'max_threads',
                   'warmup_min_memory_threshold', 'bfilter_enabled')
        for option in options:
            value = getattr(self.test_config.bucket, option)
            if value != -1 and value is not None:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.masters():
                    for bucket in self.test_config.buckets:
                        statement = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, statement)
        self.remote.restart()

    def tune_logging(self):
        """Adjust log rotation and restart the service."""
        self.remote.tune_log_rotation()
        self.remote.restart()

    def restart_with_alternative_num_cpus(self):
        """Restart nodes with a restricted CPU count, if configured."""
        num_cpus = self.test_config.cluster.num_cpus
        if num_cpus:
            self.remote.restart_with_alternative_num_cpus(num_cpus)

    def restart_with_tcmalloc_aggressive_decommit(self):
        """Restart with aggressive tcmalloc decommit, if configured."""
        if self.test_config.cluster.tcmalloc_aggressive_decommit:
            self.remote.restart_with_tcmalloc_aggressive_decommit()

    def restart_with_sfwi(self):
        """Restart with SFWI enabled, if configured."""
        if self.test_config.cluster.sfwi:
            self.remote.restart_with_sfwi()

    def enable_auto_failover(self):
        """Enable auto-failover on every cluster."""
        for master in self.masters():
            self.rest.enable_auto_failover(master)

    def wait_until_warmed_up(self):
        """Block until every bucket on every target finishes warm-up."""
        target_iterator = TargetIterator(self.cluster_spec, self.test_config)
        for target in target_iterator:
            self.monitor.monitor_warmup(self.memcached, target.node,
                                        target.bucket)

    def wait_until_healthy(self):
        """Block until every node reports a healthy status."""
        for master in self.cluster_spec.yield_masters():
            self.monitor.monitor_node_health(master)

    def change_watermarks(self):
        """Apply watermark settings via memcached flusher parameters."""
        watermark_settings = self.test_config.watermark_settings
        memcached_port = 11210
        for host_port, _ in zip(self.servers(), self.initial_nodes):
            host = host_port.split(':')[0]
            for bucket in self.test_config.buckets:
                for key, val in watermark_settings.items():
                    val = self.memcached.calc_watermark(val, self.mem_quota)
                    self.memcached.set_flusher_param(host, memcached_port,
                                                     bucket, key, val)

    def start_cbq_engine(self):
        """Start the cbq engine when the test configuration requests it."""
        if self.test_config.cluster.run_cbq:
            self.remote.start_cbq()

    def change_dcp_io_threads(self):
        """Tune DCP I/O threads for memdb and restart the service."""
        if self.test_config.secondaryindex_settings.db == 'memdb':
            self.remote.set_dcp_io_threads()
            time.sleep(30)
            self.remote.restart()
class Monitor(RestHelper):

    """Polling helpers that block until the cluster reaches a desired state.

    Extends RestHelper so all REST stats getters are available directly on
    the monitor instance.
    """

    MAX_RETRY = 60
    MAX_RETRY_RECOVERY = 1200
    MAX_RETRY_TIMER_EVENT = 18000
    MAX_RETRY_BOOTSTRAP = 1200

    MONITORING_DELAY = 5

    POLLING_INTERVAL = 2
    POLLING_INTERVAL_INDEXING = 1
    POLLING_INTERVAL_MACHINE_UP = 10
    POLLING_INTERVAL_ANALYTICS = 15
    POLLING_INTERVAL_EVENTING = 1

    REBALANCE_TIMEOUT = 3600 * 6
    TIMEOUT = 3600 * 12

    # Bucket stats that must drain to zero before persistence is complete.
    DISK_QUEUES = (
        'ep_queue_size',
        'ep_flusher_todo',
        'ep_diskqueue_items',
        'vb_active_queue_size',
        'vb_replica_queue_size',
    )

    DCP_QUEUES = (
        'ep_dcp_replica_items_remaining',
        'ep_dcp_other_items_remaining',
    )

    XDCR_QUEUES = ('replication_changes_left', )

    def __init__(self, cluster_spec, test_config, verbose):
        super().__init__(cluster_spec=cluster_spec)
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, verbose)

    def monitor_rebalance(self, host):
        """Block until the rebalance task completes or hangs."""
        logger.info('Monitoring rebalance status')

        is_running = True
        last_progress = 0
        last_progress_time = time.time()
        while is_running:
            time.sleep(self.POLLING_INTERVAL)

            is_running, progress = self.get_task_status(
                host, task_type='rebalance')

            if progress == last_progress:
                # No forward progress: bail out after REBALANCE_TIMEOUT.
                if time.time() - last_progress_time > self.REBALANCE_TIMEOUT:
                    logger.error('Rebalance hung')
                    break
            else:
                last_progress = progress
                last_progress_time = time.time()

            if progress is not None:
                logger.info('Rebalance progress: {} %'.format(progress))

        logger.info('Rebalance completed')

    def _wait_for_empty_queues(self, host, bucket, queues, stats_function):
        """Block until every queue metric in `queues` reaches zero."""
        metrics = list(queues)

        start_time = time.time()
        while metrics:
            bucket_stats = stats_function(host, bucket)
            # As we are changing metrics in the loop; take a copy of it to
            # iterate over.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[-1]
                    if last_value:
                        logger.info('{} = {:,}'.format(metric, last_value))
                        continue
                    else:
                        logger.info('{} reached 0'.format(metric))
                        metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Monitoring got stuck')

    def monitor_disk_queues(self, host, bucket):
        logger.info('Monitoring disk queues: {}'.format(bucket))
        self._wait_for_empty_queues(host, bucket, self.DISK_QUEUES,
                                    self.get_bucket_stats)

    def monitor_dcp_queues(self, host, bucket):
        logger.info('Monitoring DCP queues: {}'.format(bucket))
        self._wait_for_empty_queues(host, bucket, self.DCP_QUEUES,
                                    self.get_bucket_stats)

    def _wait_for_xdcr_to_start(self, host: str):
        """Block until an XDCR task is reported as running."""
        is_running = False
        while not is_running:
            time.sleep(self.POLLING_INTERVAL)
            is_running, _ = self.get_task_status(host, task_type='xdcr')

    def monitor_xdcr_queues(self, host: str, bucket: str):
        logger.info('Monitoring XDCR queues: {}'.format(bucket))
        self._wait_for_xdcr_to_start(host)
        self._wait_for_empty_queues(host, bucket, self.XDCR_QUEUES,
                                    self.get_xdcr_stats)

    def _get_num_items(self, host: str, bucket: str,
                       total: bool = False) -> int:
        """Return the latest item count sample (0 if no samples yet)."""
        stats = self.get_bucket_stats(host=host, bucket=bucket)
        if total:
            curr_items = stats['op']['samples'].get('curr_items_tot')
        else:
            curr_items = stats['op']['samples'].get('curr_items')
        if curr_items:
            return curr_items[-1]
        return 0

    def monitor_num_items(self, host: str, bucket: str, num_items: int):
        """Block until the bucket holds exactly num_items (active+replica)."""
        logger.info('Checking the number of items in {}'.format(bucket))
        retries = 0
        while retries < self.MAX_RETRY:
            if self._get_num_items(host, bucket, total=True) == num_items:
                break
            time.sleep(self.POLLING_INTERVAL)
            retries += 1
        else:
            actual_items = self._get_num_items(host, bucket, total=True)
            raise Exception(
                'Mismatch in the number of items: {}'.format(actual_items))

    def monitor_task(self, host, task_type):
        """Block until no task of the given type is reported anymore."""
        logger.info('Monitoring task: {}'.format(task_type))
        time.sleep(self.MONITORING_DELAY)

        while True:
            time.sleep(self.POLLING_INTERVAL)

            tasks = [
                task for task in self.get_tasks(host)
                if task.get('type') == task_type
            ]
            if tasks:
                for task in tasks:
                    logger.info('{}: {}%, bucket: {}, ddoc: {}'.format(
                        task_type, task.get('progress'),
                        task.get('bucket'), task.get('designDocument')))
            else:
                break
        logger.info('Task {} successfully completed'.format(task_type))

    def monitor_warmup(self, memcached, host, bucket):
        """Block until warmup finishes; return the warmup time in seconds."""
        logger.info('Monitoring warmup status: {}@{}'.format(bucket, host))

        memcached_port = self.get_memcached_port(host)

        while True:
            stats = memcached.get_stats(host, memcached_port, bucket,
                                        'warmup')
            if b'ep_warmup_state' in stats:
                state = stats[b'ep_warmup_state']
                if state == b'done':
                    return float(stats.get(b'ep_warmup_time', 0))
                else:
                    # Fixed log message typo ('Warmpup' -> 'Warmup')
                    logger.info('Warmup status: {}'.format(state))
                    time.sleep(self.POLLING_INTERVAL)
            else:
                logger.info('No warmup stats are available, continue polling')
                time.sleep(self.POLLING_INTERVAL)

    def monitor_compression(self, memcached, host, bucket):
        """Block until no active documents remain in JSON (uncompressed)."""
        logger.info('Monitoring active compression status')

        memcached_port = self.get_memcached_port(host)

        json_docs = -1
        while json_docs:
            stats = memcached.get_stats(host, memcached_port, bucket)
            json_docs = int(stats[b'ep_active_datatype_json'])
            if json_docs:
                logger.info('Still uncompressed: {:,} items'.format(
                    json_docs))
                time.sleep(self.POLLING_INTERVAL)
        logger.info('All items are compressed')

    def monitor_node_health(self, host):
        """Block until every node (v1 and v2 status APIs) is healthy."""
        logger.info('Monitoring node health')

        for retry in range(self.MAX_RETRY):
            unhealthy_nodes = {
                n for n, status in self.node_statuses(host).items()
                if status != 'healthy'
            } | {
                n for n, status in self.node_statuses_v2(host).items()
                if status != 'healthy'
            }
            if unhealthy_nodes:
                time.sleep(self.POLLING_INTERVAL)
            else:
                break
        else:
            logger.interrupt(
                'Some nodes are not healthy: {}'.format(unhealthy_nodes))

    def monitor_analytics_node_active(self, host):
        """Block until the analytics node reports itself as active."""
        logger.info('Monitoring analytics node health')

        for retry in range(self.MAX_RETRY):
            active = self.analytics_node_active(host)
            if active:
                break
            else:
                time.sleep(self.POLLING_INTERVAL)
        else:
            logger.interrupt(
                'Analytics node still not healthy: {}'.format(host))

    def is_index_ready(self, host: str) -> bool:
        """Return True when every index on the host is in 'Ready' state."""
        for status in self.get_index_status(host)['status']:
            if status['status'] != 'Ready':
                return False
        return True

    def estimate_pending_docs(self, host: str) -> int:
        """Sum queued + pending doc counters across all GSI stats."""
        stats = self.get_gsi_stats(host)
        pending_docs = 0
        for metric, value in stats.items():
            if 'num_docs_queued' in metric or 'num_docs_pending' in metric:
                pending_docs += value
        return pending_docs

    def monitor_indexing(self, host):
        """Block until all indexes on the host are built."""
        logger.info('Monitoring indexing progress')

        while not self.is_index_ready(host):
            time.sleep(self.POLLING_INTERVAL_INDEXING * 5)
            pending_docs = self.estimate_pending_docs(host)
            logger.info('Pending docs: {:,}'.format(pending_docs))
        logger.info('Indexing completed')

    def wait_for_secindex_init_build(self, host, indexes):
        """Block until the initial index build completes.

        Returns the elapsed build time in whole seconds.
        """
        # POLL until initial index build is complete
        logger.info(
            "Waiting for the following indexes to be ready: {}".format(
                indexes))

        indexes_ready = [0 for _ in indexes]

        def get_index_status(json2i, index):
            """Return the index status."""
            for d in json2i["status"]:
                if d["name"] == index:
                    return d["status"]
            return None

        @misc.retry(catch=(KeyError, ), iterations=10, wait=30)
        def update_indexes_ready():
            json2i = self.get_index_status(host)
            for i, index in enumerate(indexes):
                status = get_index_status(json2i, index)
                if status == 'Ready':
                    indexes_ready[i] = 1

        init_ts = time.time()
        while sum(indexes_ready) != len(indexes):
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            update_indexes_ready()
        finish_ts = time.time()
        logger.info('secondary index build time: {}'.format(
            finish_ts - init_ts))
        time_elapsed = round(finish_ts - init_ts)
        return time_elapsed

    def wait_for_secindex_incr_build(self, index_nodes, bucket, indexes,
                                     numitems):
        """Block until the incremental index build has no pending docs."""
        # POLL until incremenal index build is complete
        logger.info('expecting {} num_docs_indexed for indexes {}'.format(
            numitems, indexes))

        # collect num_docs_indexed information globally from all index nodes
        def get_num_docs_indexed():
            data = self.get_index_stats(index_nodes)
            num_indexed = []
            for index in indexes:
                key = "{}:{}:num_docs_indexed".format(bucket, index)
                num_indexed.append(data[key])
            return num_indexed

        def get_num_docs_index_pending():
            data = self.get_index_stats(index_nodes)
            num_pending = []
            for index in indexes:
                val1 = data["{}:{}:num_docs_pending".format(bucket, index)]
                val2 = data["{}:{}:num_docs_queued".format(bucket, index)]
                num_pending.append(int(val1) + int(val2))
            return num_pending

        expected_num_pending = [0] * len(indexes)
        while True:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_num_pending = get_num_docs_index_pending()
            if curr_num_pending == expected_num_pending:
                break
        curr_num_indexed = get_num_docs_indexed()
        logger.info("Number of Items indexed {}".format(curr_num_indexed))

    def wait_for_num_connections(self, index_node, expected_connections):
        """Poll until the indexer reaches the expected connection count.

        Returns False when MAX_RETRY is exhausted, True otherwise.
        """
        curr_connections = self.get_index_num_connections(index_node)
        retry = 1
        while curr_connections < expected_connections and \
                retry < self.MAX_RETRY:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_connections = self.get_index_num_connections(index_node)
            logger.info("Got current connections {}".format(
                curr_connections))
            retry += 1
        if retry == self.MAX_RETRY:
            return False
        return True

    def wait_for_recovery(self, index_nodes, bucket, index):
        """Return disk_load_duration once recovery is done, or -1 on timeout."""
        time.sleep(self.MONITORING_DELAY)
        for retry in range(self.MAX_RETRY_RECOVERY):
            response = self.get_index_stats(index_nodes)
            item = "{}:{}:disk_load_duration".format(bucket, index)
            if item in response:
                return response[item]
            else:
                time.sleep(self.POLLING_INTERVAL)
        return -1

    def wait_for_servers(self):
        """Block until all servers respond; interrupt after MAX_RETRY."""
        for retry in range(self.MAX_RETRY):
            logger.info('Waiting for all servers to be available')
            time.sleep(self.POLLING_INTERVAL_MACHINE_UP)

            for server in self.cluster_spec.servers:
                if not self.remote.is_up(server):
                    break
            else:
                logger.info('All nodes are up')
                return

        logger.interrupt('Some nodes are still down')

    def monitor_fts_indexing_queue(self, host: str, index: str, items: int):
        """Block until the FTS index holds at least `items` documents."""
        logger.info('Waiting for indexing to finish')
        count = 0
        while count < items:
            count = self.get_fts_doc_count(host, index)
            logger.info('FTS indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_fts_index_persistence(self, hosts: list, index: str):
        """Block until no FTS records remain to persist or compact."""
        logger.info('Waiting for index to be persisted')
        pending_items = 1
        while pending_items:
            persist = 0
            compact = 0
            for host in hosts:
                stats = self.get_fts_stats(host)

                metric = '{}:{}:{}'.format(self.test_config.buckets[0],
                                           index, 'num_recs_to_persist')
                persist += stats[metric]

                metric = '{}:{}:{}'.format(self.test_config.buckets[0],
                                           index, 'total_compactions')
                compact += stats[metric]

            pending_items = persist or compact
            logger.info('Records to persist: {:,}'.format(persist))
            logger.info('Ongoing compactions: {:,}'.format(compact))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_indexing_queue(self, host: str, index: str):
        """Block until Elasticsearch has indexed the configured doc count."""
        logger.info(' Waiting for indexing to finish')
        items = int(self.test_config.fts_settings.test_total_docs)
        count = 0
        while count < items:
            count = self.get_elastic_doc_count(host, index)
            logger.info('Elasticsearch indexed documents: {:,}'.format(
                count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_index_persistence(self, host: str, index: str):
        """Block until the Elasticsearch translog is fully flushed."""
        logger.info('Waiting for index to be persisted')

        pending_items = -1
        while pending_items:
            stats = self.get_elastic_stats(host)
            pending_items = stats['indices'][index]['total']['translog'][
                'operations']
            logger.info('Records to persist: {:,}'.format(pending_items))
            time.sleep(self.POLLING_INTERVAL)

    def wait_for_bootstrap(self, nodes: list, function: str):
        """Block until the eventing function is deployed on every node."""
        logger.info(
            'Waiting for bootstrap of eventing function: {} '.format(
                function))
        for node in nodes:
            retry = 1
            while retry < self.MAX_RETRY_BOOTSTRAP:
                if function in self.get_deployed_apps(node):
                    break
                time.sleep(self.POLLING_INTERVAL)
                retry += 1
            if retry == self.MAX_RETRY_BOOTSTRAP:
                logger.info(
                    'Failed to bootstrap function: {}, node: {}'.format(
                        function, node))

    def get_num_analytics_items(self, data_node: str, bucket: str) -> int:
        """Sum incoming record counters across all analytics nodes.

        Uses dict.get with a default of 0 so a node that has not yet
        reported the bucket does not raise KeyError (consistent with the
        newer implementation of this helper elsewhere in this file).
        """
        stats_key = '{}:all:incoming_records_count_total'.format(bucket)
        num_items = 0
        for node in self.get_active_nodes_by_role(data_node, 'cbas'):
            stats = self.get_analytics_stats(node)
            num_items += stats.get(stats_key, 0)
        return num_items

    def monitor_data_synced(self, data_node: str, bucket: str) -> int:
        """Block until analytics has ingested all bucket items."""
        logger.info('Waiting for data to be synced from {}'.format(
            data_node))

        num_items = self._get_num_items(data_node, bucket)

        while True:
            num_analytics_items = self.get_num_analytics_items(data_node,
                                                               bucket)
            if num_analytics_items == num_items:
                break
            logger.info('Analytics has {:,} docs (target is {:,})'.format(
                num_analytics_items, num_items))
            time.sleep(self.POLLING_INTERVAL_ANALYTICS)

        return num_items

    def wait_for_timer_event(self, node: str, function: str,
                             event="DOC_TIMER_EVENTS"):
        """Block until the eventing function starts processing timer events."""
        logger.info(
            'Waiting for timer events to start processing: {} '.format(
                function))
        retry = 1
        while retry < self.MAX_RETRY_TIMER_EVENT:
            if 0 < self.get_num_events_processed(
                    event=event, node=node, name=function):
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info(
                'Failed to get timer event for function: {}'.format(
                    function))

    def wait_for_all_mutations_processed(self, host: str, bucket1: str,
                                         bucket2: str):
        """Block until source and destination bucket item counts match."""
        logger.info(
            'Waiting for mutations to be processed of eventing function')
        retry = 1
        while retry < self.MAX_RETRY_BOOTSTRAP:
            if self._get_num_items(host=host, bucket=bucket1) == \
                    self._get_num_items(host=host, bucket=bucket2):
                break
            retry += 1
            time.sleep(self.POLLING_INTERVAL_EVENTING)
        if retry == self.MAX_RETRY_BOOTSTRAP:
            logger.info('Failed to process all mutations... TIMEOUT')
class ClusterManager(object):

    """Drive initial cluster configuration for a test run: node paths, quotas,
    server groups, node addition/rebalance, bucket creation, OS tweaks and
    service restarts, using REST and SSH helpers built from the cluster spec.
    """

    def __init__(self, cluster_spec, test_config):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec)
        self.monitor = Monitor(cluster_spec)
        self.memcached = MemcachedHelper(cluster_spec)

        # yield_clusters() is materialized once; the other yield_* helpers are
        # stored unbound so each caller gets a fresh iterator.
        self.clusters = cluster_spec.yield_clusters()
        self.servers = cluster_spec.yield_servers
        self.masters = cluster_spec.yield_masters
        self.hostnames = cluster_spec.yield_hostnames

        self.initial_nodes = test_config.cluster.initial_nodes
        self.mem_quota = test_config.cluster.mem_quota
        self.group_number = test_config.cluster.group_number or 1

    def set_data_path(self):
        """Apply the spec's data/index disk paths to every server."""
        data_path, index_path = self.cluster_spec.paths
        for server in self.servers():
            self.rest.set_data_path(server, data_path, index_path)

    def set_auth(self):
        """Initialize REST credentials on every server."""
        for server in self.servers():
            self.rest.set_auth(server)

    def set_mem_quota(self):
        """Set the cluster-wide memory quota on every server."""
        for server in self.servers():
            self.rest.set_mem_quota(server, self.mem_quota)

    def disable_moxi(self):
        """Disable the Moxi proxy when the test config asks for it."""
        if self.test_config.cluster.disable_moxi is not None:
            self.remote.disable_moxi()

    def create_server_groups(self):
        """Create "Group 2"..."Group N" on each master ("Group 1" exists)."""
        for master in self.masters():
            for i in range(1, self.group_number):
                name = 'Group {}'.format(i + 1)
                self.rest.create_server_group(master, name=name)

    def add_nodes(self):
        """Add the configured initial nodes to each cluster and rebalance."""
        for (_, servers), initial_nodes in zip(self.clusters,
                                               self.initial_nodes):
            if initial_nodes < 2:  # Single-node cluster, nothing to add
                continue

            master = servers[0]
            if self.group_number > 1:
                groups = self.rest.get_server_groups(master)
            else:
                groups = {}
            for i, host_port in enumerate(servers[1:initial_nodes], start=1):
                host = host_port.split(':')[0]
                # Pick the server-group URI for this node (None if ungrouped).
                uri = groups.get(
                    server_group(servers[:initial_nodes],
                                 self.group_number, i))
                self.rest.add_node(master, host, uri)

            # Rebalance the freshly extended cluster
            known_nodes = servers[:initial_nodes]
            ejected_nodes = []
            self.rest.rebalance(master, known_nodes, ejected_nodes)
            self.monitor.monitor_rebalance(master)

    def create_buckets(self):
        """Create every configured bucket on every master, splitting the
        memory quota evenly between buckets.
        """
        # Floor division: the bucket RAM quota is an integer number of MB;
        # true division ('/') would produce a float under Python 3.
        ram_quota = self.mem_quota // self.test_config.cluster.num_buckets
        replica_number = self.test_config.bucket.replica_number
        replica_index = self.test_config.bucket.replica_index
        eviction_policy = self.test_config.bucket.eviction_policy
        threads_number = self.test_config.bucket.threads_number

        for master in self.masters():
            for bucket_name in self.test_config.buckets:
                self.rest.create_bucket(
                    host_port=master,
                    name=bucket_name,
                    ram_quota=ram_quota,
                    replica_number=replica_number,
                    replica_index=replica_index,
                    eviction_policy=eviction_policy,
                    threads_number=threads_number,
                )

    def configure_auto_compaction(self):
        """Push the configured auto-compaction settings to each master."""
        compaction_settings = self.test_config.compaction
        for master in self.masters():
            self.rest.configure_auto_compaction(master, compaction_settings)

    def configure_internal_settings(self):
        """Apply each configured internal setting (coerced to int) per master."""
        internal_settings = self.test_config.internal_settings
        for master in self.masters():
            for parameter, value in internal_settings.items():
                self.rest.set_internal_settings(master,
                                                {parameter: int(value)})

    def tweak_memory(self):
        """Reset swap, drop caches, set swappiness and disable THP via SSH."""
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def restart_with_alternative_num_vbuckets(self):
        """Restart with a non-default vBucket count when configured."""
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        """Override per-bucket extra_config_string options via diag/eval and
        restart the cluster so they take effect.
        """
        cmd = 'ns_bucket:update_bucket_props("{}", ' \
            '[{{extra_config_string, "{}={}"}}]).'

        for option in ('max_num_shards', 'max_threads'):
            value = getattr(self.test_config.bucket, option)
            if value:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.masters():
                    for bucket in self.test_config.buckets:
                        diag_eval = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, diag_eval)
                # NOTE(review): restart placement reconstructed from a
                # whitespace-mangled source; assumed to run only when an
                # option was actually changed — confirm against upstream.
                self.remote.restart()

    def restart_with_alternative_num_cpus(self):
        """Restart with a restricted CPU count when configured."""
        num_cpus = self.test_config.cluster.num_cpus
        if num_cpus:
            self.remote.restart_with_alternative_num_cpus(num_cpus)

    def enable_auto_failover(self):
        """Enable auto-failover on each master."""
        for master in self.masters():
            self.rest.enable_auto_failover(master)

    def wait_until_warmed_up(self):
        """Block until every target bucket has finished warmup."""
        target_iterator = TargetIterator(self.cluster_spec, self.test_config)
        for target in target_iterator:
            host = target.node.split(':')[0]
            self.monitor.monitor_warmup(self.memcached, host, target.bucket)

    def change_watermarks(self):
        """Set memcached flusher watermarks (scaled against the mem quota) on
        every host and bucket.
        """
        watermark_settings = self.test_config.watermark_settings
        for hostname, _ in zip(self.hostnames(), self.initial_nodes):
            for bucket in self.test_config.buckets:
                for key, val in watermark_settings.items():
                    val = self.memcached.calc_watermark(val, self.mem_quota)
                    self.memcached.set_flusher_param(hostname, bucket,
                                                     key, val)

    def start_cbq_engine(self):
        """Start the cbq query engine when the test config asks for it."""
        if self.test_config.cluster.run_cbq:
            self.remote.start_cbq()