Example #1
def main():
    args = get_args()

    cluster_spec = ClusterSpec()
    cluster_spec.parse(args.cluster)

    remote = RemoteHelper(cluster_spec, args.verbose)
    remote.get_manifest()
    projects = parse_manifest()
    fetch(projects)
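
The get_args() helper is not shown in any of these examples; a minimal
sketch of what it might look like, assuming argparse and flag names that
are illustrative rather than the project's actual interface:

import argparse


def get_args():
    # Hypothetical parser; the real option names may differ.
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--cluster', dest='cluster',
                        help='path to the cluster specification file')
    parser.add_argument('--verbose', action='store_true',
                        help='enable verbose logging')
    return parser.parse_args()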
Example #2
def main():
    args = get_args()

    cluster_spec = ClusterSpec()
    cluster_spec.parse(args.cluster_spec_fname)

    remote = RemoteHelper(cluster_spec, verbose=False)

    logger.info('Recovering system state')
    for host, version in remote.get_system_backup_version().items():
        remote.start_system_state_recovery(host, version)
Example #3
def main():
    options, args = get_options()

    cluster_spec = ClusterSpec()
    cluster_spec.parse(options.cluster_spec_fname, args)

    remote = RemoteHelper(cluster_spec, test_config=None, verbose=False)

    logger.info('Recovering system state')
    for host, version in remote.get_system_backup_version().items():
        remote.start_system_state_recovery(host, version)
Example #4
def main():
    options, args = get_options()

    cluster_spec = ClusterSpec()
    cluster_spec.parse(options.cluster_spec_fname, args)

    remote = RemoteHelper(cluster_spec, test_config=None, verbose=False)

    remote.collect_info()
    for hostname in cluster_spec.yield_hostnames():
        for fname in glob.glob('{}/*.zip'.format(hostname)):
            shutil.move(fname, '{}.zip'.format(hostname))
Example #5
class MongoDBInstaller(CouchbaseInstaller):

    URL = 'http://fastdl.mongodb.org/linux/mongodb-linux-x86_64-2.6.1.tgz'

    def __init__(self, cluster_spec, options):
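        # Unlike the Couchbase installer, this only needs a remote helper;
        # build detection is skipped entirely.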
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)

    def uninstall_package(self):
        pass

    def clean_data(self):
        self.remote.clean_mongodb()

    def install_package(self):
        self.remote.install_mongodb(url=self.URL)
Example #6
    def __init__(self,
                 cluster_spec: ClusterSpec,
                 test_config: TestConfig,
                 verbose: bool):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.target_iterator = TargetIterator(cluster_spec, test_config)

        self.cluster = ClusterManager(cluster_spec, test_config)
        self.memcached = MemcachedHelper(test_config)
        self.monitor = Monitor(cluster_spec, test_config, verbose)
        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.profiler = Profiler(cluster_spec, test_config)

        self.master_node = next(cluster_spec.masters)
        self.build = self.rest.get_version(self.master_node)

        self.metrics = MetricHelper(self)
        self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)

        self.cbmonitor_snapshots = []
        self.cbmonitor_clusters = []

        if self.test_config.test_case.use_workers:
            self.worker_manager = WorkerManager(cluster_spec, test_config,
                                                verbose)
Example #7
    def __init__(self, cluster_spec, test_config, experiment=None):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.target_iterator = TargetIterator(self.cluster_spec,
                                              self.test_config)

        self.memcached = MemcachedHelper(cluster_spec)
        self.monitor = Monitor(cluster_spec)
        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec)

        if experiment:
            self.experiment = ExperimentHelper(experiment,
                                               cluster_spec, test_config)

        self.master_node = cluster_spec.yield_masters().next()
        self.build = self.rest.get_version(self.master_node)

        self.cbagent = CbAgent(self)
        self.metric_helper = MetricHelper(self)
        self.reporter = Reporter(self)
        self.reports = {}
        self.snapshots = []
        self.master_events = []

        if self.test_config.test_case.use_workers:
            self.worker_manager = WorkerManager(cluster_spec, test_config)
Example #8
 def __init__(self,
              cluster_spec: ClusterSpec,
              test_config: TestConfig,
              verbose: bool = False):
     self.cluster_spec = cluster_spec
     self.test_config = test_config
     self.dynamic_infra = self.cluster_spec.dynamic_infrastructure
     self.rest = RestHelper(cluster_spec, test_config)
     self.remote = RemoteHelper(cluster_spec, verbose)
     self.monitor = Monitor(cluster_spec, test_config, verbose)
     self.memcached = MemcachedHelper(test_config)
     self.master_node = next(self.cluster_spec.masters)
     if self.dynamic_infra:
         self.initial_nodes = None
     else:
         self.initial_nodes = test_config.cluster.initial_nodes
     self.build = self.rest.get_version(self.master_node)
Example #9
def main():
    args = get_args()

    cluster_spec = ClusterSpec()
    cluster_spec.parse(args.cluster_spec_fname)

    remote = RemoteHelper(cluster_spec, test_config=None, verbose=False)

    remote.collect_info()
    for hostname in cluster_spec.servers:
        for fname in glob.glob('{}/*.zip'.format(hostname)):
            shutil.move(fname, '{}.zip'.format(hostname))

    if cluster_spec.backup is not None:
        logs = os.path.join(cluster_spec.backup, 'logs')
        if os.path.exists(logs):
            shutil.make_archive('tools', 'zip', logs)
Example #10
    def __init__(self, cluster_spec, test_config):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec)
        self.monitor = Monitor(cluster_spec)
        self.memcached = MemcachedHelper(cluster_spec)

        self.clusters = cluster_spec.yield_clusters()
        self.servers = cluster_spec.yield_servers
        self.masters = cluster_spec.yield_masters
        self.hostnames = cluster_spec.yield_hostnames

        self.initial_nodes = test_config.cluster.initial_nodes
        self.mem_quota = test_config.cluster.mem_quota
        self.group_number = test_config.cluster.group_number or 1
Example #11
def main():
    args = get_args()

    cluster_spec = ClusterSpec()
    cluster_spec.parse(args.cluster_spec_fname)

    remote = RemoteHelper(cluster_spec, verbose=False)

    remote.collect_info()
    for hostname in cluster_spec.servers:
        for fname in glob.glob('{}/*.zip'.format(hostname)):
            shutil.move(fname, '{}.zip'.format(hostname))

    if cluster_spec.backup is not None:
        logs = os.path.join(cluster_spec.backup, 'logs')
        if os.path.exists(logs):
            shutil.make_archive('tools', 'zip', logs)
Example #12
    def __init__(self, test, verbose):
        self.clusters = OrderedDict()
        self.remote = RemoteHelper(test.cluster_spec, test.test_config,
                                   verbose=verbose)

        for cluster_name, servers in test.cluster_spec.yield_clusters():
            cluster = '{}_{}_{}'.format(cluster_name,
                                        test.build.replace('.', ''),
                                        uhex()[:3])
            master = servers[0].split(':')[0]
            self.clusters[cluster] = master
        if test.test_config.test_case.monitor_clients:
            for node in test.cluster_spec.workers:
                cluster = '{}{}'.format(self.clusters.items()[0][0][:-3], uhex()[:3])
                master = node.split(':')[0]
                self.clusters[cluster] = master

        self.index_node = ''
        for _, servers in test.cluster_spec.yield_servers_by_role('index'):
            if servers:
                self.index_node = servers[0].split(':')[0]

        if hasattr(test, 'ALL_BUCKETS'):
            buckets = None
        else:
            buckets = test.test_config.buckets[:1]
        if hasattr(test, 'ALL_HOSTNAMES'):
            hostnames = tuple(test.cluster_spec.yield_hostnames())
        else:
            hostnames = None

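        # Build a throwaway settings object: type() creates an anonymous class
        # on the fly, and its attributes are drawn from the test configuration.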
        self.settings = type('settings', (object,), {
            'seriesly_host': test.test_config.stats_settings.seriesly['host'],
            'cbmonitor_host_port': test.test_config.stats_settings.cbmonitor['host'],
            'interval': test.test_config.stats_settings.interval,
            'secondary_statsfile': test.test_config.stats_settings.secondary_statsfile,
            'buckets': buckets,
            'hostnames': hostnames,
            'sync_gateway_nodes':
                test.remote.gateways if test.remote else None,
            'monitor_clients':
                test.cluster_spec.workers if test.test_config.test_case.monitor_clients else None,
            'fts_server': test.test_config.test_case.fts_server
        })()
        self.lat_interval = test.test_config.stats_settings.lat_interval
        if test.cluster_spec.ssh_credentials:
            self.settings.ssh_username, self.settings.ssh_password = \
                test.cluster_spec.ssh_credentials
        self.settings.rest_username, self.settings.rest_password = \
            test.cluster_spec.rest_credentials
        self.settings.bucket_password = test.test_config.bucket.password

        self.settings.index_node = self.index_node

        self.collectors = []
        self.processes = []
        self.snapshots = []
        self.fts_stats = None
Example #13
    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.verbose = verbose

        self.snapshot = self.test_config.restore_settings.snapshot

        self.remote = RemoteHelper(self.cluster_spec, self.test_config,
                                   self.verbose)
Example #14
    def __init__(self, cluster_spec, options):
        self.remote_helper = RemoteHelper(cluster_spec)
        self.cluster_spec = cluster_spec

        arch = self.remote_helper.detect_arch()
        pkg = self.remote_helper.detect_pkg()
        openssl = self.remote_helper.detect_openssl(pkg)

        self.build = Build(arch, pkg, options.version, openssl, options.toy)
        logger.info('Target build info: {}'.format(self.build))
Example #15
 def calc_network_bandwidth(self):
     self.remote = RemoteHelper(self.cluster_spec, self.test_config, verbose=True)
     for cluster_name, servers in self.cluster_spec.yield_clusters():
         self.in_bytes_transfer += [self.remote.read_bandwidth_stats("to", servers)]
         self.out_bytes_transfer += [self.remote.read_bandwidth_stats("from", servers)]
     logger.info('in bytes: {}'.format(self.in_bytes_transfer))
     logger.info('out bytes: {}'.format(self.out_bytes_transfer))
     return OrderedDict((
         ('in bytes', sum(self.in_bytes_transfer[0].values())),
         ('out bytes', sum(self.out_bytes_transfer[0].values()))))
Example #16
    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, verbose)

        self.workers = cycle(self.cluster_spec.workers)

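        # Recycle the remote workers: stop any stale processes, start fresh
        # ones, and block until they report ready.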
        self.terminate()
        self.start()
        self.wait_until_workers_are_ready()
Example #17
    def __init__(self, test):
        self.clusters = OrderedDict()
        self.remote = RemoteHelper(test.cluster_spec, test.test_config, verbose=True)

        for cluster_name, servers in test.cluster_spec.yield_clusters():
            cluster = "{}_{}_{}".format(cluster_name, test.build.replace(".", ""), uhex()[:3])
            master = servers[0].split(":")[0]
            self.clusters[cluster] = master
        if test.test_config.test_case.monitor_clients:
            for node in test.cluster_spec.workers:
                cluster = "{}{}".format(self.clusters.items()[0][0][:-3], uhex()[:3])
                master = node.split(":")[0]
                self.clusters[cluster] = master

        self.index_node = ""
        for _, servers in test.cluster_spec.yield_servers_by_role("index"):
            if servers:
                self.index_node = servers[0].split(":")[0]

        if hasattr(test, "ALL_BUCKETS"):
            buckets = None
        else:
            buckets = test.test_config.buckets[:1]
        if hasattr(test, "ALL_HOSTNAMES"):
            hostnames = tuple(test.cluster_spec.yield_hostnames())
        else:
            hostnames = None

        self.settings = type(
            "settings",
            (object,),
            {
                "seriesly_host": test.test_config.stats_settings.seriesly["host"],
                "cbmonitor_host_port": test.test_config.stats_settings.cbmonitor["host"],
                "interval": test.test_config.stats_settings.interval,
                "secondary_statsfile": test.test_config.stats_settings.secondary_statsfile,
                "buckets": buckets,
                "hostnames": hostnames,
                "sync_gateway_nodes": test.remote.gateways if test.remote else None,
                "monitor_clients": test.cluster_spec.workers if test.test_config.test_case.monitor_clients else None,
            },
        )()
        self.lat_interval = test.test_config.stats_settings.lat_interval
        if test.cluster_spec.ssh_credentials:
            self.settings.ssh_username, self.settings.ssh_password = test.cluster_spec.ssh_credentials
        self.settings.rest_username, self.settings.rest_password = test.cluster_spec.rest_credentials
        self.settings.bucket_password = test.test_config.bucket.password

        self.settings.index_node = self.index_node

        self.collectors = []
        self.processes = []
        self.snapshots = []
        self.bandwidth = False
Example #18
 def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
              verbose: bool):
     self.cluster_spec = cluster_spec
     self.test_config = test_config
     self.memcached = MemcachedHelper(test_config)
     self.remote = RemoteHelper(cluster_spec, test_config, verbose)
     self.rest = RestHelper(cluster_spec)
     # self.build = os.environ.get('SGBUILD') or "0.0.0-000"
     self.master_node = next(cluster_spec.masters)
     self.build = self.rest.get_sgversion(self.master_node)
     self.metrics = MetricHelper(self)
     self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)
     if self.test_config.test_case.use_workers:
         self.worker_manager = WorkerManager(cluster_spec, test_config,
                                             verbose)
     self.settings = self.test_config.access_settings
     self.settings.syncgateway_settings = self.test_config.syncgateway_settings
     self.profiler = Profiler(cluster_spec, test_config)
     self.cluster = ClusterManager(cluster_spec, test_config)
     self.target_iterator = TargetIterator(cluster_spec, test_config)
     self.monitor = Monitor(cluster_spec, test_config, verbose)
Example #19
    def __init__(self, cluster_spec, options):
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        arch = self.remote.detect_arch()
        pkg = self.remote.detect_pkg()
        release, build = options.version.split('-')
        self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(build)

        self.build = Build(arch, pkg, options.version, release, build,
                           options.toy)
        logger.info('Target build info: {}'.format(self.build))
Example #20
def main():
    args = get_args()

    cluster_spec = ClusterSpec()
    cluster_spec.parse(args.cluster_spec_fname)

    remote = RemoteHelper(cluster_spec, verbose=False)

    remote.collect_info()

    for hostname in cluster_spec.servers:
        for fname in glob.glob('{}/*.zip'.format(hostname)):
            shutil.move(fname, '{}.zip'.format(hostname))

    if cluster_spec.backup is not None:
        logs = os.path.join(cluster_spec.backup, 'logs')
        if os.path.exists(logs):
            shutil.make_archive('tools', 'zip', logs)

    failures = defaultdict(dict)

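    # Scan every collected archive for panics, crash files, and index
    # storage corruption.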
    for file_name in glob.iglob('./*.zip'):
        panic_files, crash_files, storage_corrupted = validate_logs(file_name)
        if panic_files:
            failures['panics'][file_name] = panic_files
        if crash_files:
            failures['crashes'][file_name] = crash_files
        if storage_corrupted:
            failures['storage_corrupted'][file_name] = True
            remote.collect_index_datafiles()

    if failures:
        logger.interrupt(
            "Following failures found: {}".format(pretty_dict(failures)))
Example #21
    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool = False):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.monitor = Monitor(cluster_spec, test_config, verbose)
        self.memcached = MemcachedHelper(test_config)

        self.master_node = next(self.cluster_spec.masters)

        self.initial_nodes = test_config.cluster.initial_nodes
Example #22
    def __init__(self, cluster_spec, options):
        self.options = options
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        arch = self.remote.detect_arch()
        pkg = self.remote.detect_pkg()

        release = None
        build = None
        if options.version:
            release, build = options.version.split('-')
            self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(
                build)
            self.WATSON_BUILDS = 'http://172.23.120.24/builds/latestbuilds/couchbase-server/watson/{}/'.format(
                build)
            if options.toy:
                self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/toy-{}/{}/'.format(
                    options.toy, build)
        self.build = Build(arch, pkg, options.cluster_edition, options.version,
                           release, build, options.toy, options.url)
        logger.info('Target build info: {}'.format(self.build))
Example #23
    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.dynamic_infra = self.cluster_spec.dynamic_infrastructure
        self.target_iterator = TargetIterator(cluster_spec, test_config)
        self.cluster = ClusterManager(cluster_spec, test_config)
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.profiler = Profiler(cluster_spec, test_config)
        self.master_node = next(cluster_spec.masters)
        self.memcached = MemcachedHelper(test_config)
        self.monitor = Monitor(cluster_spec, test_config, verbose)
        self.rest = RestHelper(cluster_spec)
        self.build = self.rest.get_version(self.master_node)
        self.metrics = MetricHelper(self)
        self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)

        self.cbmonitor_snapshots = []
        self.cbmonitor_clusters = []

        if self.test_config.test_case.use_workers:
            self.worker_manager = WorkerManager(cluster_spec, test_config,
                                                verbose)
Example #24
class FIOTest(PerfTest):

    TRACKER = 'fio.sc.couchbase.com'

    TEMPLATE = {
        'group': '{}, random mixed reads and writes, IOPS',
        'metric': None,
        'value': None,
    }

    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)

    def __exit__(self, *args, **kwargs):
        pass

    @staticmethod
    def _parse(results):
        """Parse the test output.

        See also https://github.com/axboe/fio/blob/master/HOWTO
        """
        stats = defaultdict(int)
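        # Each job in fio's terse output is one semicolon-separated record;
        # 0-based fields 7 and 48 carry read and write IOPS respectively.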
        for host, output in results.items():
            for job in output.split():
                stats[host] += int(job.split(';')[7])  # reads
                stats[host] += int(job.split(';')[48])  # writes
        return stats

    def _post(self, data):
        data = pretty_dict(data)
        logger.info('Posting: {}'.format(data))
        requests.post('http://{}/api/v1/benchmarks'.format(self.TRACKER),
                      data=data)

    def _report_kpi(self, stats):
        for host, iops in stats.items():
            data = self.TEMPLATE.copy()
            data['group'] = data['group'].format(
                self.cluster_spec.name.title())
            data['metric'] = host
            data['value'] = iops

            self._post(data)

    def run(self):
        stats = self.remote.fio(self.test_config.fio['config'])
        self._report_kpi(self._parse(stats))
Example #25
    def download_local(self):
        """Download and save a copy of the specified package."""
        try:
            if RemoteHelper.detect_server_os("127.0.0.1").upper() in ('UBUNTU', 'DEBIAN'):
                os_release = detect_ubuntu_release()

                url = self.find_package(edition=self.options.edition,
                                        package="deb", os_release=os_release)
                logger.info('Saving a local copy of {}'.format(url))
                with open('couchbase.deb', 'wb') as fh:
                    resp = requests.get(url)
                    fh.write(resp.content)
        except Exception:
            logger.info("Saving a local copy for Ubuntu failed; the package may not be present")
Example #26
class FIOTest(PerfTest):

    TRACKER = 'fio.sc.couchbase.com'

    TEMPLATE = {
        'group': '{}, random mixed reads and writes, IOPS',
        'metric': None,
        'value': None,
    }

    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)

    def __exit__(self, *args, **kwargs):
        pass

    @staticmethod
    def _parse(results):
        """Terse output parsing is based on the following guide:

            https://github.com/axboe/fio/blob/master/HOWTO
        """
        stats = defaultdict(int)
        for host, output in results.items():
            for job in output.split():
                stats[host] += int(job.split(';')[7])  # reads
                stats[host] += int(job.split(';')[48])  # writes
        return stats

    def _post(self, data):
        data = pretty_dict(data)
        logger.info('Posting: {}'.format(data))
        requests.post('http://{}/api/v1/benchmarks'.format(self.TRACKER),
                      data=data)

    def _report_kpi(self, stats):
        for host, iops in stats.items():
            data = self.TEMPLATE.copy()
            data['group'] = data['group'].format(self.cluster_spec.name.title())
            data['metric'] = host
            data['value'] = iops

            self._post(data)

    def run(self):
        stats = self.remote.fio(self.test_config.fio['config'])
        self._report_kpi(self._parse(stats))
Example #27
    def __init__(self, *args, **kwargs):
        options, args = get_options()

        self.cluster_spec = ClusterSpec()
        self.cluster_spec.parse(options.cluster_spec_fname, args)
        self.test_config = TestConfig()
        self.test_config.parse(options.test_config_fname, args)

        self.target_iterator = TargetIterator(self.cluster_spec,
                                              self.test_config)
        self.memcached = MemcachedHelper(self.test_config)
        self.remote = RemoteHelper(self.cluster_spec, self.test_config)
        self.rest = RestHelper(self.cluster_spec)

        super(FunctionalTest, self).__init__(*args, **kwargs)
Example #28
 def __init__(self, cluster_spec, options):
     self.options = options
     self.cluster_spec = cluster_spec
     self.operator_version = self.options.operator_version
     self.couchbase_version = self.options.couchbase_version
     self.node_count = len(self.cluster_spec.infrastructure_clusters['couchbase1'].split())
     self.remote = RemoteHelper(cluster_spec)
     self.release = self.operator_version.split("-")[0]
     self.build = self.operator_version.split("-")[1]
     self.docker_config_path = os.path.expanduser("~") + "/.docker/config.json"
     self.operator_base_path = "cloud/operator/{}/{}".format(self.release[0], self.release[2])
     self.certificate_authority_path = "{}/ca.crt".format(self.operator_base_path)
     self.crd_path = "{}/crd.yaml".format(self.operator_base_path)
     self.config_path = "{}/config.yaml".format(self.operator_base_path)
     self.config_template_path = "{}/config_template.yaml".format(self.operator_base_path)
     self.auth_path = "{}/auth_secret.yaml".format(self.operator_base_path)
     self.cb_cluster_path = "{}/couchbase-cluster.yaml".format(self.operator_base_path)
     self.template_cb_cluster_path = "{}/couchbase-cluster_template.yaml"\
         .format(self.operator_base_path)
     self.worker_base_path = "cloud/worker"
     self.worker_path = "{}/worker.yaml".format(self.worker_base_path)
     self.rmq_base_path = "cloud/broker/rabbitmq/0.48"
     self.rmq_operator_path = "{}/cluster-operator.yaml".format(self.rmq_base_path)
     self.rmq_cluster_path = "{}/rabbitmq.yaml".format(self.rmq_base_path)
Example #29
    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
        self.monitor = Monitor(cluster_spec)
        self.memcached = MemcachedHelper(test_config)

        self.clusters = cluster_spec.yield_clusters()
        self.servers = cluster_spec.yield_servers
        self.masters = cluster_spec.yield_masters

        self.initial_nodes = test_config.cluster.initial_nodes
        self.mem_quota = test_config.cluster.mem_quota
        self.group_number = test_config.cluster.group_number or 1
Example #30
    def __init__(self, *args, **kwargs):
        options, _args = get_options()
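        # Trailing CLI args such as 'section.option.value,...' are turned into
        # an iterable of overrides for TestConfig.parse().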
        override = \
            _args and (arg.split('.') for arg in ' '.join(_args).split(','))

        self.cluster_spec = ClusterSpec()
        self.cluster_spec.parse(options.cluster_spec_fname)
        self.test_config = TestConfig()
        self.test_config.parse(options.test_config_fname, override)

        self.target_iterator = TargetIterator(self.cluster_spec,
                                              self.test_config)
        self.memcached = MemcachedHelper(self.cluster_spec)
        self.remote = RemoteHelper(self.cluster_spec)

        super(FunctionalTest, self).__init__(*args, **kwargs)
Example #31
    def __init__(self, cluster_spec, options):
        self.options = options
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        arch = self.remote.detect_arch()
        pkg = self.remote.detect_pkg()

        release = None
        build = None
        if options.version:
            release, build = options.version.split('-')

        self.build = Build(arch, pkg, options.cluster_edition, options.version,
                           release, build, options.url)
        logger.info('Target build info: {}'.format(self.build))
Example #32
    def __init__(self, cluster_spec, options):
        self.options = options
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        arch = self.remote.detect_arch()
        pkg = self.remote.detect_pkg()

        release = None
        build = None
        if options.version:
            release, build = options.version.split('-')
            self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(build)
            self.WATSON_BUILDS = 'http://172.23.120.24/builds/latestbuilds/couchbase-server/watson/{}/'.format(build)
            if options.toy:
                self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/toy-{}/{}/'.format(options.toy, build)
        self.build = Build(arch, pkg, options.cluster_edition, options.version,
                           release, build, options.toy, options.url)
        logger.info('Target build info: {}'.format(self.build))
Example #33
class Monitor(RestHelper):

    MAX_RETRY = 150
    MAX_RETRY_RECOVERY = 1200
    MAX_RETRY_TIMER_EVENT = 18000
    MAX_RETRY_BOOTSTRAP = 1200

    MONITORING_DELAY = 5

    POLLING_INTERVAL = 2
    POLLING_INTERVAL_INDEXING = 1
    POLLING_INTERVAL_MACHINE_UP = 10
    POLLING_INTERVAL_ANALYTICS = 15
    POLLING_INTERVAL_EVENTING = 1

    REBALANCE_TIMEOUT = 3600 * 6
    TIMEOUT = 3600 * 12

    DISK_QUEUES = (
        'ep_queue_size',
        'ep_flusher_todo',
        'ep_diskqueue_items',
        'vb_active_queue_size',
        'vb_replica_queue_size',
    )

    DCP_QUEUES = (
        'ep_dcp_replica_items_remaining',
        'ep_dcp_other_items_remaining',
    )

    XDCR_QUEUES = (
        'replication_changes_left',
    )

    def __init__(self, cluster_spec, test_config, verbose):
        super().__init__(cluster_spec=cluster_spec)
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, verbose)

    def monitor_rebalance(self, host):
        logger.info('Monitoring rebalance status')

        is_running = True
        last_progress = 0
        last_progress_time = time.time()
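        # Treat the rebalance as hung if its reported progress stays unchanged
        # for longer than REBALANCE_TIMEOUT.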
        while is_running:
            time.sleep(self.POLLING_INTERVAL)

            is_running, progress = self.get_task_status(host,
                                                        task_type='rebalance')
            if progress == last_progress:
                if time.time() - last_progress_time > self.REBALANCE_TIMEOUT:
                    logger.error('Rebalance hung')
                    break
            else:
                last_progress = progress
                last_progress_time = time.time()

            if progress is not None:
                logger.info('Rebalance progress: {} %'.format(progress))

        logger.info('Rebalance completed')

    def _wait_for_empty_queues(self, host, bucket, queues, stats_function):
        metrics = list(queues)

        start_time = time.time()
        while metrics:
            bucket_stats = stats_function(host, bucket)
            # Since we modify the metrics list inside the loop, iterate over a copy.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[-1]
                    if last_value:
                        logger.info('{} = {:,}'.format(metric, last_value))
                        continue
                    else:
                        logger.info('{} reached 0'.format(metric))
                    metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Monitoring got stuck')

    def _wait_for_replication_completion(self, host, bucket, queues, stats_function, link1, link2):
        metrics = list(queues)

        completion_count = 0
        link1_time = 0
        link2_items = 0

        link1_completeness_str = \
            'replications/{}/bucket-1/bucket-1/percent_completeness'.format(link1)
        link2_completeness_str = \
            'replications/{}/bucket-1/bucket-1/percent_completeness'.format(link2)
        link2_items_str = \
            'replications/{}/bucket-1/bucket-1/docs_written'.format(link2)

        start_time = time.time()

        while metrics:
            bucket_stats = stats_function(host, bucket)
            # Since we modify the metrics list inside the loop, iterate over a copy.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[-1]
                    if last_value:
                        logger.info('{} = {:,}'.format(metric, last_value))
                        link1_completeness = \
                            bucket_stats['op']['samples'].get(link1_completeness_str)[-1]
                        link2_completeness = \
                            bucket_stats['op']['samples'].get(link2_completeness_str)[-1]
                        if link1_completeness == 100 or \
                                link2_completeness == 100:
                            if completion_count == 0:
                                link1_time = time.time()
                                link2_items = \
                                    bucket_stats['op']['samples'].get(link2_items_str)[-1]
                                completion_count += 1
                        continue
                    else:
                        logger.info('{} reached 0'.format(metric))
                        if completion_count == 0:
                            link1_time = time.time()
                            link2_items = \
                                bucket_stats['op']['samples'].get(link2_items_str)[-1]
                            completion_count = completion_count + 1
                    metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Monitoring got stuck')

        return link1_time, link2_items

    def _wait_for_completeness(self, host, bucket, xdcr_link, stats_function):
        metrics = [xdcr_link]

        start_time = time.time()

        while metrics:
            bucket_stats = stats_function(host, bucket)

            # Iterate over a copy, since the list is mutated inside the loop.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[-1]
                    if last_value != 100:
                        logger.info('{} : {}'.format(metric, last_value))
                    else:
                        logger.info('{} Completed 100 %'.format(metric))
                        metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Monitoring got stuck')

    def monitor_disk_queues(self, host, bucket):
        logger.info('Monitoring disk queues: {}'.format(bucket))
        self._wait_for_empty_queues(host, bucket, self.DISK_QUEUES,
                                    self.get_bucket_stats)

    def monitor_dcp_queues(self, host, bucket):
        logger.info('Monitoring DCP queues: {}'.format(bucket))
        self._wait_for_empty_queues(host, bucket, self.DCP_QUEUES,
                                    self.get_bucket_stats)

    def _wait_for_xdcr_to_start(self, host: str):
        is_running = False
        while not is_running:
            time.sleep(self.POLLING_INTERVAL)
            is_running, _ = self.get_task_status(host, task_type='xdcr')

    def xdcr_link_starttime(self, host: str, uuid: str):
        is_running = False
        while not is_running:
            time.sleep(self.POLLING_INTERVAL)
            is_running, _ = self.get_xdcrlink_status(host, task_type='xdcr', uuid=uuid)
        return time.time()

    def monitor_xdcr_queues(self, host: str, bucket: str):
        logger.info('Monitoring XDCR queues: {}'.format(bucket))
        self._wait_for_xdcr_to_start(host)
        self._wait_for_empty_queues(host, bucket, self.XDCR_QUEUES,
                                    self.get_xdcr_stats)

    def monitor_xdcr_changes_left(self, host: str, bucket: str, xdcrlink1: str, xdcrlink2: str):
        logger.info('Monitoring XDCR queues: {}'.format(bucket))
        self._wait_for_xdcr_to_start(host)
        start_time = time.time()
        link1_time, link2_items = self._wait_for_replication_completion(host, bucket,
                                                                        self.XDCR_QUEUES,
                                                                        self.get_xdcr_stats,
                                                                        xdcrlink1,
                                                                        xdcrlink2)
        return start_time, link1_time, link2_items

    def monitor_xdcr_completeness(self, host: str, bucket: str, xdcr_link: str):
        logger.info('Monitoring XDCR Link Completeness: {}'.format(bucket))
        self._wait_for_completeness(host=host, bucket=bucket, xdcr_link=xdcr_link,
                                    stats_function=self.get_xdcr_stats)
        return time.time()

    def _get_num_items(self, host: str, bucket: str, total: bool = False) -> int:
        stats = self.get_bucket_stats(host=host, bucket=bucket)
        if total:
            curr_items = stats['op']['samples'].get('curr_items_tot')
        else:
            curr_items = stats['op']['samples'].get('curr_items')
        if curr_items:
            return curr_items[-1]
        return 0

    def monitor_num_items(self, host: str, bucket: str, num_items: int):
        logger.info('Checking the number of items in {}'.format(bucket))
        retries = 0
        while retries < self.MAX_RETRY:
            curr_items = self._get_num_items(host, bucket, total=True)
            if curr_items == num_items:
                break
            else:
                logger.info('{}(curr_items) != {}(num_items)'.format(curr_items, num_items))
            time.sleep(self.POLLING_INTERVAL)
            retries += 1
        else:
            actual_items = self._get_num_items(host, bucket, total=True)
            raise Exception('Mismatch in the number of items: {}'
                            .format(actual_items))

    def monitor_task(self, host, task_type):
        logger.info('Monitoring task: {}'.format(task_type))
        time.sleep(self.MONITORING_DELAY)

        while True:
            time.sleep(self.POLLING_INTERVAL)

            tasks = [task for task in self.get_tasks(host)
                     if task.get('type') == task_type]
            if tasks:
                for task in tasks:
                    logger.info('{}: {}%, bucket: {}, ddoc: {}'.format(
                        task_type, task.get('progress'),
                        task.get('bucket'), task.get('designDocument')
                    ))
            else:
                break
        logger.info('Task {} successfully completed'.format(task_type))

    def monitor_warmup(self, memcached, host, bucket):
        logger.info('Monitoring warmup status: {}@{}'.format(bucket,
                                                             host))

        memcached_port = self.get_memcached_port(host)

        while True:
            stats = memcached.get_stats(host, memcached_port, bucket, 'warmup')
            if b'ep_warmup_state' in stats:
                state = stats[b'ep_warmup_state']
                if state == b'done':
                    return float(stats.get(b'ep_warmup_time', 0))
                else:
                    logger.info('Warmup status: {}'.format(state))
                    time.sleep(self.POLLING_INTERVAL)
            else:
                logger.info('No warmup stats are available; continue polling')
                time.sleep(self.POLLING_INTERVAL)

    def monitor_compression(self, memcached, host, bucket):
        logger.info('Monitoring active compression status')

        memcached_port = self.get_memcached_port(host)

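        # The non-zero sentinel guarantees at least one stats poll; the loop
        # ends once no active documents remain stored as raw JSON.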
        json_docs = -1
        while json_docs:
            stats = memcached.get_stats(host, memcached_port, bucket)
            json_docs = int(stats[b'ep_active_datatype_json'])
            if json_docs:
                logger.info('Still uncompressed: {:,} items'.format(json_docs))
                time.sleep(self.POLLING_INTERVAL)
        logger.info('All items are compressed')

    def monitor_node_health(self, host):
        logger.info('Monitoring node health')

        for retry in range(self.MAX_RETRY):
            unhealthy_nodes = {
                n for n, status in self.node_statuses(host).items()
                if status != 'healthy'
            } | {
                n for n, status in self.node_statuses_v2(host).items()
                if status != 'healthy'
            }
            if unhealthy_nodes:
                time.sleep(self.POLLING_INTERVAL)
            else:
                break
        else:
            logger.interrupt('Some nodes are not healthy: {}'.format(
                unhealthy_nodes
            ))

    def monitor_analytics_node_active(self, host):
        logger.info('Monitoring analytics node health')

        for retry in range(self.MAX_RETRY):
            active = self.analytics_node_active(host)
            if active:
                break
            else:
                time.sleep(self.POLLING_INTERVAL)
        else:
            logger.interrupt('Analytics node still not healthy: {}'.format(
                host
            ))

    def is_index_ready(self, host: str) -> bool:
        for status in self.get_index_status(host)['status']:
            if status['status'] != 'Ready':
                return False
        return True

    def estimate_pending_docs(self, host: str) -> int:
        stats = self.get_gsi_stats(host)
        pending_docs = 0
        for metric, value in stats.items():
            if 'num_docs_queued' in metric or 'num_docs_pending' in metric:
                pending_docs += value
        return pending_docs

    def monitor_indexing(self, host):
        logger.info('Monitoring indexing progress')

        while not self.is_index_ready(host):
            time.sleep(self.POLLING_INTERVAL_INDEXING * 5)
            pending_docs = self.estimate_pending_docs(host)
            logger.info('Pending docs: {:,}'.format(pending_docs))

        logger.info('Indexing completed')

    def wait_for_secindex_init_build(self, host, indexes):
        # Poll until the initial index build is complete
        logger.info(
            "Waiting for the following indexes to be ready: {}".format(indexes))

        indexes_ready = [0 for _ in indexes]

        def get_index_status(json2i, index):
            """Return the index status."""
            for d in json2i["status"]:
                if d["name"] == index:
                    return d["status"]
            return None

        @misc.retry(catch=(KeyError,), iterations=10, wait=30)
        def update_indexes_ready():
            json2i = self.get_index_status(host)
            for i, index in enumerate(indexes):
                status = get_index_status(json2i, index)
                if status == 'Ready':
                    indexes_ready[i] = 1

        init_ts = time.time()
        while sum(indexes_ready) != len(indexes):
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            update_indexes_ready()
        finish_ts = time.time()
        logger.info('secondary index build time: {}'.format(finish_ts - init_ts))
        time_elapsed = round(finish_ts - init_ts)
        return time_elapsed

    def wait_for_secindex_incr_build(self, index_nodes, bucket, indexes, numitems):
        # Poll until the incremental index build is complete
        logger.info('expecting {} num_docs_indexed for indexes {}'.format(numitems, indexes))

        # collect num_docs_indexed information globally from all index nodes
        def get_num_docs_indexed():
            data = self.get_index_stats(index_nodes)
            num_indexed = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_indexed"
                val = data[key]
                num_indexed.append(val)
            return num_indexed

        def get_num_docs_index_pending():
            data = self.get_index_stats(index_nodes)
            num_pending = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_pending"
                val1 = data[key]
                key = "" + bucket + ":" + index + ":num_docs_queued"
                val2 = data[key]
                val = int(val1) + int(val2)
                num_pending.append(val)
            return num_pending

        expected_num_pending = [0] * len(indexes)
        while True:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_num_pending = get_num_docs_index_pending()
            if curr_num_pending == expected_num_pending:
                break
        curr_num_indexed = get_num_docs_indexed()
        logger.info("Number of Items indexed {}".format(curr_num_indexed))

    def wait_for_num_connections(self, index_node, expected_connections):
        curr_connections = self.get_index_num_connections(index_node)
        retry = 1
        while curr_connections < expected_connections and retry < self.MAX_RETRY:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_connections = self.get_index_num_connections(index_node)
            logger.info("Got current connections {}".format(curr_connections))
            retry += 1
        if retry == self.MAX_RETRY:
            return False
        return True

    def wait_for_recovery(self, index_nodes, bucket, index):
        time.sleep(self.MONITORING_DELAY)
        for retry in range(self.MAX_RETRY_RECOVERY):
            response = self.get_index_stats(index_nodes)
            item = "{}:{}:disk_load_duration".format(bucket, index)
            if item in response:
                return response[item]
            else:
                time.sleep(self.POLLING_INTERVAL)
        return -1

    def wait_for_servers(self):
        for retry in range(self.MAX_RETRY):
            logger.info('Waiting for all servers to be available')
            time.sleep(self.POLLING_INTERVAL_MACHINE_UP)

            for server in self.cluster_spec.servers:
                if not self.remote.is_up(server):
                    break
            else:
                logger.info('All nodes are up')
                return

        logger.interrupt('Some nodes are still down')

    def monitor_fts_indexing_queue(self, host: str, index: str, items: int):
        logger.info('Waiting for indexing to finish')
        count = 0
        while count < items:
            count = self.get_fts_doc_count(host, index)
            logger.info('FTS indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_fts_index_persistence(self, hosts: list, index: str, bkt: str = None):
        logger.info('Waiting for index to be persisted')
        if not bkt:
            bkt = self.test_config.buckets[0]
        pending_items = 1
        while pending_items:
            persist = 0
            compact = 0
            for host in hosts:
                stats = self.get_fts_stats(host)

                metric = '{}:{}:{}'.format(bkt, index, 'num_recs_to_persist')
                persist += stats[metric]

                metric = '{}:{}:{}'.format(bkt, index, 'total_compactions')
                compact += stats[metric]

            pending_items = persist or compact
            logger.info('Records to persist: {:,}'.format(persist))
            logger.info('Ongoing compactions: {:,}'.format(compact))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_indexing_queue(self, host: str, index: str):
        logger.info('Waiting for indexing to finish')
        items = int(self.test_config.fts_settings.test_total_docs)
        count = 0
        while count < items:
            count = self.get_elastic_doc_count(host, index)
            logger.info('Elasticsearch indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_index_persistence(self, host: str, index: str):
        logger.info('Waiting for index to be persisted')

        pending_items = -1
        while pending_items:
            stats = self.get_elastic_stats(host)
            pending_items = stats['indices'][index]['total']['translog']['operations']
            logger.info('Records to persist: {:,}'.format(pending_items))
            time.sleep(self.POLLING_INTERVAL)

    def wait_for_bootstrap(self, nodes: list, function: str):
        logger.info('Waiting for bootstrap of eventing function: {}'.format(function))
        for node in nodes:
            retry = 1
            while retry < self.MAX_RETRY_BOOTSTRAP:
                if function in self.get_apps_with_status(node, "deployed"):
                    break
                time.sleep(self.POLLING_INTERVAL)
                retry += 1
            if retry == self.MAX_RETRY_BOOTSTRAP:
                logger.info('Failed to bootstrap function: {}, node: {}'.
                            format(function, node))

    def get_num_analytics_items(self, data_node: str, bucket: str) -> int:
        stats_key = '{}:all:incoming_records_count_total'.format(bucket)
        num_items = 0
        for node in self.get_active_nodes_by_role(data_node, 'cbas'):
            stats = self.get_analytics_stats(node)
            num_items += stats[stats_key]
        return num_items

    def monitor_data_synced(self, data_node: str, bucket: str) -> int:
        logger.info('Waiting for data to be synced from {}'.format(data_node))

        num_items = self._get_num_items(data_node, bucket)

        while True:
            num_analytics_items = self.get_num_analytics_items(data_node,
                                                               bucket)
            if num_analytics_items == num_items:
                break
            logger.info('Analytics has {:,} docs (target is {:,})'.format(
                num_analytics_items, num_items))
            time.sleep(self.POLLING_INTERVAL_ANALYTICS)

        return num_items

    def wait_for_timer_event(self, node: str, function: str, event="timer_events"):
        logger.info('Waiting for timer events to start processing: {}'.format(function))
        retry = 1
        while retry < self.MAX_RETRY_TIMER_EVENT:
            if 0 < self.get_num_events_processed(
                    event=event, node=node, name=function):
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info('Failed to get timer event for function: {}'.format(function))

    def wait_for_all_mutations_processed(self, host: str, bucket1: str, bucket2: str):
        logger.info('Waiting for the eventing function to process all mutations')
        retry = 1
        while retry < self.MAX_RETRY_BOOTSTRAP:
            if self._get_num_items(host=host, bucket=bucket1) == \
                    self._get_num_items(host=host, bucket=bucket2):
                break
            retry += 1
            time.sleep(self.POLLING_INTERVAL_EVENTING)
        if retry == self.MAX_RETRY_BOOTSTRAP:
            logger.info('Failed to process all mutations... TIMEOUT')

    def wait_for_all_timer_creation(self, node: str, function: str):
        logger.info('Waiting for all timers to be created by {}'.format(function))
        retry = 1
        events_processed = {}
        while retry < self.MAX_RETRY_TIMER_EVENT:
            events_processed = self.get_num_events_processed(event="ALL",
                                                             node=node, name=function)
            if events_processed["dcp_mutation"] == events_processed["timer_responses_received"]:
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info('Got only {} timers created for function: {}'.format(
                events_processed["timer_responses_received"], function))

    def wait_for_function_undeploy(self, node: str, function: str):
        logger.info('Waiting for {} function to undeploy'.format(function))
        retry = 1
        while retry < self.MAX_RETRY_TIMER_EVENT:
            op = self.get_apps_with_status(node, "undeployed")
            if function in op:
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info('Function {} failed to undeploy'.format(function))
Example #34
 def __init__(self, cluster_spec, test_config, verbose):
     super().__init__(cluster_spec=cluster_spec)
     self.cluster_spec = cluster_spec
     self.test_config = test_config
     self.remote = RemoteHelper(cluster_spec, verbose)
Example #35
class GatewayInstaller(object):

    CBFS = 'http://cbfs-ext.hq.couchbase.com/builds/'

    def __init__(self, cluster_spec, test_config, options):
        self.remote_helper = RemoteHelper(cluster_spec)
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.pkg = self.remote_helper.detect_pkg()
        self.version = options.version

    def find_package(self):
        filename = 'couchbase-sync-gateway_{}_x86_64.rpm'.format(self.version)
        url = '{}{}'.format(self.CBFS, filename)
        try:
            status_code = requests.head(url).status_code
        except requests.exceptions.ConnectionError:
            pass
        else:
            if status_code == 200:
                logger.info('Found "{}"'.format(url))
                return filename, url
        logger.interrupt('Target build not found - {}'.format(url))

    def kill_processes_gw(self):
        self.remote_helper.kill_processes_gw()

    def kill_processes_gl(self):
        self.remote_helper.kill_processes_gl()

    def uninstall_package_gw(self):
        filename, url = self.find_package()
        self.remote_helper.uninstall_package_gw(self.pkg, filename)

    def uninstall_package_gl(self):
        self.remote_helper.uninstall_package_gl()

    def install_package_gw(self):
        filename, url = self.find_package()
        self.remote_helper.install_package_gw(self.pkg, url, filename,
                                              self.version)

    def install_package_gl(self):
        self.remote_helper.install_package_gl()

    def start_sync_gateways(self):
        with open('templates/gateway_config_template.json') as fh:
            template = json.load(fh)

        db_master = self.cluster_spec.yield_masters().next()
        template['databases']['db']['server'] = "http://*****:*****@{}/".format(db_master)
        template['maxIncomingConnections'] = self.test_config.gateway_settings.conn_in
        template['maxCouchbaseConnections'] = self.test_config.gateway_settings.conn_db
        template['CompressResponses'] = self.test_config.gateway_settings.compression

        with open('templates/gateway_config.json', 'w') as fh:
            fh.write(pretty_dict(template))
        self.remote_helper.start_sync_gateway()

    def install(self):
        num_gateways = len(self.cluster_spec.gateways)
        num_gateloads = len(self.cluster_spec.gateloads)
        if num_gateways != num_gateloads:
            logger.interrupt(
                'The cluster config file has a different number of '
                'gateways ({}) and gateloads ({})'
                .format(num_gateways, num_gateloads)
            )
        self.kill_processes_gw()
        self.uninstall_package_gw()
        self.install_package_gw()
        self.start_sync_gateways()
        self.kill_processes_gl()
        self.uninstall_package_gl()
        self.install_package_gl()
Example #36
 def __init__(self, cluster_spec, options):
     self.remote = RemoteHelper(cluster_spec, options.verbose)
     self.options = options
     self.cluster_spec = cluster_spec
Example #37
class CouchbaseInstaller(object):

    CBFS = 'http://cbfs-ext.hq.couchbase.com/builds/'
    LATEST_BUILDS = 'http://latestbuilds.hq.couchbase.com/'
    SHERLOCK_BUILDS = ''
    WATSON_BUILDS = ''

    def __init__(self, cluster_spec, options):
        self.options = options
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        arch = self.remote.detect_arch()
        pkg = self.remote.detect_pkg()

        release = None
        build = None
        if options.version:
            release, build = options.version.split('-')
            self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(build)
            self.WATSON_BUILDS = 'http://172.23.120.24/builds/latestbuilds/couchbase-server/watson/{}/'.format(build)
            if options.toy:
                self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/toy-{}/{}/'.format(options.toy, build)
        self.build = Build(arch, pkg, options.cluster_edition, options.version,
                           release, build, options.toy, options.url)
        logger.info('Target build info: {}'.format(self.build))

    def get_expected_filenames(self):
        patterns = ()  # Sentinel
        if self.build.toy:
            patterns = (
                'couchbase-server-community_toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_toy-{toy}-{version}-toy_{arch}.{pkg}',
                'couchbase-server-community_cent58-2.5.2-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent58-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_ubunt12-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent64-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent64-3.0.1-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent58-master-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent54-master-toy-{toy}-{arch}_{version}-toy.{pkg}',
                # For toy builds >= Sherlock
                'couchbase-server-{edition}-{version}-centos6.{arch}.{pkg}',
                'couchbase-server-{edition}_{version}-ubuntu12.04_amd64.{pkg}',
            )
        elif self.build.pkg == 'rpm':
            patterns = (
                'couchbase-server-{edition}_centos6_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}-{version}-centos6.{arch}.{pkg}',
                'couchbase-server-{edition}_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}-{version}.{arch}.{pkg}',
            )
        elif self.build.pkg == 'deb':
            patterns = (
                'couchbase-server-{edition}_ubuntu_1204_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}_{version}-ubuntu12.04_amd64.{pkg}',
                'couchbase-server-{edition}_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}_{version}_amd64.{pkg}',
            )
        elif self.build.pkg == 'exe':
            patterns = (
                'couchbase-server-{edition}_{arch}_{version}-rel.setup.{pkg}',
                'couchbase_server-{edition}-windows-amd64-{version}.{pkg}',
                'couchbase-server-{edition}_{version}-windows_amd64.{pkg}',
                'couchbase_server/{release}/{build}/couchbase_server-{edition}-windows-amd64-{version}.exe',
                'couchbase-server-{edition}_{version}-windows_amd64.{pkg}',
            )

        for pattern in patterns:
            yield pattern.format(**self.build._asdict())

    def find_package(self):
        for filename in self.get_expected_filenames():
            for base in (self.LATEST_BUILDS, self.SHERLOCK_BUILDS, self.WATSON_BUILDS, self.CBFS):
                url = '{}{}'.format(base, filename)
                try:
                    status_code = requests.head(url).status_code
                except ConnectionError:
                    continue
                else:
                    if status_code == 200:
                        logger.info('Found "{}"'.format(url))
                        return filename, url
        logger.interrupt('Target build not found')

    def kill_processes(self):
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase(self.build.pkg)

    def clean_data(self):
        self.remote.clean_data()

    def install_package(self):
        if not self.options.url:
            filename, url = self.find_package()
        else:
            url = self.options.url
            logger.info("Using this URL to install instead of searching amongst"
                        " the known locations: {}".format(url))
            # Obtain the filename after the last '/' of the URL.
            filename = urlparse(url).path.split('/')[-1]

        self.remote.install_couchbase(self.build.pkg, url, filename,
                                      self.build.release)

    def install(self):
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
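
install() chains the four steps in a fixed order (kill processes, uninstall, clean data, install). A hedged driver sketch, reusing the option-parsing convention from the other examples in this collection:

def main():
    options, args = get_options()

    cluster_spec = ClusterSpec()
    cluster_spec.parse(options.cluster_spec_fname, args)

    installer = CouchbaseInstaller(cluster_spec, options)
    installer.install()  # kill -> uninstall -> clean -> install
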
Example #38
0
class ClusterManager(object):
    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
        self.monitor = Monitor(cluster_spec)
        self.memcached = MemcachedHelper(test_config)

        self.clusters = cluster_spec.yield_clusters
        self.servers = cluster_spec.yield_servers
        self.masters = cluster_spec.yield_masters

        self.initial_nodes = test_config.cluster.initial_nodes
        self.mem_quota = test_config.cluster.mem_quota
        self.index_mem_quota = test_config.cluster.index_mem_quota
        self.group_number = test_config.cluster.group_number or 1
        self.roles = cluster_spec.roles

    def set_data_path(self):
        if self.cluster_spec.paths:
            data_path, index_path = self.cluster_spec.paths
            for server in self.servers():
                self.rest.set_data_path(server, data_path, index_path)

    def set_auth(self):
        for server in self.servers():
            self.rest.set_auth(server)

    def set_mem_quota(self):
        for server in self.servers():
            self.rest.set_mem_quota(server, self.mem_quota)

    def set_index_mem_quota(self):
        for server in self.servers():
            self.rest.set_index_mem_quota(server, self.index_mem_quota)

    def set_query_settings(self):
        settings = self.test_config.n1ql_settings.settings
        for _, servers in self.cluster_spec.yield_servers_by_role('n1ql'):
            for server in servers:
                self.rest.set_query_settings(server, settings)

    def set_index_settings(self):
        settings = self.test_config.secondaryindex_settings.settings
        for _, servers in self.cluster_spec.yield_servers_by_role('index'):
            for server in servers:
                self.rest.set_index_settings(server, settings)
        self.remote.restart()
        time.sleep(60)

    def set_services(self):
        for (_, servers), initial_nodes in zip(self.clusters(),
                                               self.initial_nodes):
            master = servers[0]
            self.rest.set_services(master, self.roles[master])

    def disable_moxi(self):
        if self.test_config.cluster.disable_moxi is not None:
            self.remote.disable_moxi()

    def create_server_groups(self):
        for master in self.masters():
            for i in range(1, self.group_number):
                name = 'Group {}'.format(i + 1)
                self.rest.create_server_group(master, name=name)

    def add_nodes(self):
        for (_, servers), initial_nodes in zip(self.clusters(),
                                               self.initial_nodes):

            if initial_nodes < 2:  # Single-node cluster
                continue

            # Adding initial nodes
            master = servers[0]
            if self.group_number > 1:
                groups = self.rest.get_server_groups(master)
            else:
                groups = {}
            for i, host_port in enumerate(servers[1:initial_nodes], start=1):
                uri = groups.get(
                    server_group(servers[:initial_nodes], self.group_number,
                                 i))
                self.rest.add_node(master, host_port, self.roles[host_port],
                                   uri)

            # Rebalance
            master = servers[0]
            known_nodes = servers[:initial_nodes]
            ejected_nodes = []
            self.rest.rebalance(master, known_nodes, ejected_nodes)
            self.monitor.monitor_rebalance(master)

    def create_buckets(self, emptyBuckets=False):
        ram_quota = self.mem_quota / (self.test_config.cluster.num_buckets +
                                      self.test_config.cluster.emptybuckets)
        replica_number = self.test_config.bucket.replica_number
        replica_index = self.test_config.bucket.replica_index
        eviction_policy = self.test_config.bucket.eviction_policy
        threads_number = self.test_config.bucket.threads_number
        proxyPort = self.test_config.bucket.proxyPort
        password = self.test_config.bucket.password
        buckets = self.test_config.emptybuckets if emptyBuckets else self.test_config.buckets

        for master in self.masters():
            for bucket_name in buckets:
                self.rest.create_bucket(host_port=master,
                                        name=bucket_name,
                                        ram_quota=ram_quota,
                                        replica_number=replica_number,
                                        replica_index=replica_index,
                                        eviction_policy=eviction_policy,
                                        threads_number=threads_number,
                                        password=password,
                                        proxyPort=proxyPort)

    def configure_auto_compaction(self):
        compaction_settings = self.test_config.compaction
        for master in self.masters():
            self.rest.configure_auto_compaction(master, compaction_settings)

    def configure_internal_settings(self):
        internal_settings = self.test_config.internal_settings
        for master in self.masters():
            for parameter, value in internal_settings.items():
                self.rest.set_internal_settings(master,
                                                {parameter: int(value)})

    def tweak_memory(self):
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def restart_with_alternative_num_vbuckets(self):
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        cmd = 'ns_bucket:update_bucket_props("{}", ' \
              '[{{extra_config_string, "{}={}"}}]).'

        for option in ('defragmenter_enabled', 'exp_pager_stime', 'ht_locks',
                       'max_num_shards', 'max_threads',
                       'warmup_min_memory_threshold', 'bfilter_enabled'):
            value = getattr(self.test_config.bucket, option)
            if value != -1 and value is not None:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.masters():
                    for bucket in self.test_config.buckets:
                        diag_eval = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, diag_eval)
                self.remote.restart()

    def tune_logging(self):
        self.remote.tune_log_rotation()
        self.remote.restart()

    def restart_with_alternative_num_cpus(self):
        num_cpus = self.test_config.cluster.num_cpus
        if num_cpus:
            self.remote.restart_with_alternative_num_cpus(num_cpus)

    def restart_with_tcmalloc_aggressive_decommit(self):
        if self.test_config.cluster.tcmalloc_aggressive_decommit:
            self.remote.restart_with_tcmalloc_aggressive_decommit()

    def restart_with_sfwi(self):
        if self.test_config.cluster.sfwi:
            self.remote.restart_with_sfwi()

    def enable_auto_failover(self):
        for master in self.masters():
            self.rest.enable_auto_failover(master)

    def wait_until_warmed_up(self):
        target_iterator = TargetIterator(self.cluster_spec, self.test_config)
        for target in target_iterator:
            self.monitor.monitor_warmup(self.memcached, target.node,
                                        target.bucket)

    def wait_until_healthy(self):
        for master in self.cluster_spec.yield_masters():
            self.monitor.monitor_node_health(master)

    def change_watermarks(self):
        watermark_settings = self.test_config.watermark_settings
        for host_port, initial_nodes in zip(self.servers(),
                                            self.initial_nodes):
            host = host_port.split(':')[0]
            memcached_port = self.rest.get_memcached_port(host_port)
            for bucket in self.test_config.buckets:
                for key, val in watermark_settings.items():
                    val = self.memcached.calc_watermark(val, self.mem_quota)
                    self.memcached.set_flusher_param(host, memcached_port,
                                                     bucket, key, val)

    def start_cbq_engine(self):
        if self.test_config.cluster.run_cbq:
            self.remote.start_cbq()
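
The methods above map onto a cluster bring-up sequence; the ordering below is inferred from the method names and is only a sketch, not an order the snippet itself prescribes.

cm = ClusterManager(cluster_spec, test_config, verbose=False)
cm.set_data_path()
cm.set_auth()
cm.set_mem_quota()
cm.set_services()
cm.add_nodes()                  # adds the initial nodes, then rebalances
cm.create_buckets()
cm.configure_auto_compaction()
cm.wait_until_warmed_up()
cm.wait_until_healthy()
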
Example #39
0
class RemoteWorkerManager:

    WORKER_HOME = '/tmp/perfrunner'

    PING_INTERVAL = 1

    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
        self.workers = cycle(self.cluster_spec.workers)
        self.terminate()
        self.start()
        self.wait_until_workers_are_ready()

    @property
    def is_remote(self) -> bool:
        return True

    def next_worker(self) -> str:
        return next(self.workers)

    def reset_workers(self):
        self.workers = cycle(self.cluster_spec.workers)

    def start(self):
        logger.info('Initializing remote worker environment')
        self.remote.init_repo(self.WORKER_HOME)

        for worker in self.cluster_spec.workers:
            logger.info(
                'Starting remote Celery worker, host={}'.format(worker))
            perfrunner_home = os.path.join(self.WORKER_HOME, 'perfrunner')
            self.remote.start_celery_worker(worker, perfrunner_home)

    def wait_until_workers_are_ready(self):
        workers = [
            'celery@{}'.format(worker) for worker in self.cluster_spec.workers
        ]
        while True:
            responses = celery.control.ping(workers)
            if len(responses) == len(workers):
                break
            time.sleep(self.PING_INTERVAL)
        logger.info('All remote Celery workers are ready')

    def run_tasks(self,
                  task: Callable,
                  task_settings: PhaseSettings,
                  target_iterator: TargetIterator,
                  timer: int = None):
        self.async_results = []
        self.reset_workers()
        for target in target_iterator:
            for instance in range(task_settings.worker_instances):
                worker = self.next_worker()
                logger.info('Running the task on {}'.format(worker))
                async_result = task.apply_async(
                    args=(task_settings, target, timer, instance),
                    queue=worker,
                    expires=timer,
                )
                self.async_results.append(async_result)

    def run_sg_tasks(self,
                     task: Callable,
                     task_settings: PhaseSettings,
                     timer: int = None,
                     distribute: bool = False,
                     phase: str = ""):
        self.async_results = []
        self.reset_workers()

        if distribute:
            total_threads = int(task_settings.syncgateway_settings.threads)
            total_clients = int(task_settings.syncgateway_settings.clients)
            instances_per_client = int(
                task_settings.syncgateway_settings.instances_per_client)
            total_instances = total_clients * instances_per_client
            threads_per_instance = int(total_threads / total_instances) or 1
            worker_id = 0
            for instance in range(instances_per_client):
                for client in self.cluster_spec.workers[:total_clients]:
                    worker_id += 1
                    logger.info(
                        'Running the \'{}\' phase by worker #{} on client {}'.format(
                            phase, worker_id, client))
                    task_settings.syncgateway_settings.threads_per_instance = str(
                        threads_per_instance)
                    async_result = task.apply_async(
                        args=(task_settings, timer, worker_id,
                              self.cluster_spec),
                        queue=client,
                        expires=timer,
                    )
                    self.async_results.append(async_result)
        else:
            client = self.cluster_spec.workers[0]
            logger.info(
                'Running single-instance task \'{}\' on client {}'.format(
                    phase, client))
            async_result = task.apply_async(
                args=(task_settings, timer, 0, self.cluster_spec),
                queue=client,
                expires=timer,
            )
            self.async_results.append(async_result)

    def wait_for_workers(self):
        logger.info('Waiting for all tasks to finish')
        for async_result in self.async_results:
            async_result.get()
        logger.info('All tasks are done')

    def download_celery_logs(self):
        if not os.path.exists('celery'):
            os.mkdir('celery')
        self.remote.get_celery_logs(self.WORKER_HOME)

    def terminate(self):
        logger.info('Terminating Celery workers')
        self.remote.terminate_client_processes()
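
A sketch of driving the worker manager for one workload phase; my_task is a hypothetical Celery task, and only the methods shown above are taken from the class.

manager = RemoteWorkerManager(cluster_spec, test_config, verbose=False)
manager.run_tasks(task=my_task,  # hypothetical Celery task
                  task_settings=test_config.access_settings,
                  target_iterator=TargetIterator(cluster_spec, test_config),
                  timer=600)
manager.wait_for_workers()
manager.download_celery_logs()
manager.terminate()
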
Example #40
0
class RestoreHelper:

    def __init__(self, cluster_spec, test_config):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.snapshot = self.test_config.restore_settings.snapshot

        self.remote = RemoteHelper(self.cluster_spec)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def fetch_maps(self):
        rest = RestHelper(self.cluster_spec)
        master_node = next(self.cluster_spec.masters)

        maps = {}
        for bucket in self.test_config.buckets:
            vbmap = rest.get_vbmap(master_node, bucket)
            server_list = rest.get_server_list(master_node, bucket)
            maps[bucket] = (vbmap, server_list)

        return maps

    def cp(self, server, cmd):
        logger.info('Restoring files on {}'.format(server))

        with settings(host_string=server,
                      user=self.cluster_spec.ssh_credentials[0],
                      password=self.cluster_spec.ssh_credentials[1]):
            run(cmd)

    def restore(self):
        maps = self.fetch_maps()

        self.remote.stop_server()

        threads = []

        for bucket, (vbmap, server_list) in maps.items():
            files = defaultdict(list)
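            # files maps each server to the vbucket ids it hosts, per the vbmap.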

            for vb_idx, nodes in enumerate(vbmap):
                for node_idx in nodes:
                    files[server_list[node_idx]].append(vb_idx)

            for server, vbuckets in files.items():
                cmd = 'cp '
                for vbucket in vbuckets:
                    cmd += '{}/{}.couch.1 '.format(self.snapshot, vbucket)
                cmd += '/data/{}'.format(bucket)

                threads.append(Thread(target=self.cp, args=(server, cmd)))

        for t in threads:
            t.start()
            time.sleep(1)
        for t in threads:
            t.join()
        state.connections.clear()

        self.remote.drop_caches()
        self.remote.start_server()

    def warmup(self):
        cm = ClusterManager(self.cluster_spec, self.test_config)
        cm.wait_until_warmed_up()
        cm.wait_until_healthy()
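
Since the class defines __enter__/__exit__, it can be used as a context manager; a minimal usage sketch:

with RestoreHelper(cluster_spec, test_config) as restorer:
    restorer.restore()  # stop server, copy vbucket files in parallel, start server
    restorer.warmup()   # wait until buckets are warmed up and nodes are healthy
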
Example #41
0
class PerfTest(object):

    COLLECTORS = {}

    def __init__(self, cluster_spec, test_config, experiment=None):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.target_iterator = TargetIterator(self.cluster_spec,
                                              self.test_config)

        self.memcached = MemcachedHelper(cluster_spec)
        self.monitor = Monitor(cluster_spec)
        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec)

        if experiment:
            self.experiment = ExperimentHelper(experiment,
                                               cluster_spec, test_config)

        self.master_node = next(cluster_spec.yield_masters())
        self.build = self.rest.get_version(self.master_node)

        self.cbagent = CbAgent(self)
        self.metric_helper = MetricHelper(self)
        self.reporter = Reporter(self)
        self.reports = {}
        self.snapshots = []
        self.master_events = []

        if self.test_config.test_case.use_workers:
            self.worker_manager = WorkerManager(cluster_spec, test_config)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.test_config.test_case.use_workers:
            self.worker_manager.terminate()
        if exc_type != exc.KeyboardInterrupt:
            self.debug()
        for master in self.cluster_spec.yield_masters():
            if not self.rest.is_balanced(master):
                logger.interrupt('Rebalance failed')

            num_failovers = self.rest.get_failover_counter(master)
            if hasattr(self, 'rebalance_settings'):
                if self.rebalance_settings.failover or \
                        self.rebalance_settings.graceful_failover:
                    continue
            if num_failovers:
                logger.interrupt(
                    'Failover happened {} time(s)'.format(num_failovers)
                )

    def compact_bucket(self):
        for master in self.cluster_spec.yield_masters():
            for bucket in self.test_config.buckets:
                self.rest.trigger_bucket_compaction(master, bucket)
        for master in self.cluster_spec.yield_masters():
            self.monitor.monitor_task(master, 'bucket_compaction')

    def wait_for_persistence(self):
        for master in self.cluster_spec.yield_masters():
            for bucket in self.test_config.buckets:
                self.monitor.monitor_disk_queue(master, bucket)
                self.monitor.monitor_tap_replication(master, bucket)

    def load(self):
        load_settings = self.test_config.load_settings
        log_phase('load phase', load_settings)
        self.worker_manager.run_workload(load_settings, self.target_iterator)
        self.worker_manager.wait_for_workers()

    def hot_load(self):
        hot_load_settings = self.test_config.hot_load_settings
        log_phase('hot load phase', hot_load_settings)
        self.worker_manager.run_workload(hot_load_settings,
                                         self.target_iterator)
        self.worker_manager.wait_for_workers()

    def access(self):
        access_settings = self.test_config.access_settings
        log_phase('access phase', access_settings)
        self.worker_manager.run_workload(access_settings, self.target_iterator)
        self.worker_manager.wait_for_workers()

    def access_bg(self):
        access_settings = self.test_config.access_settings
        log_phase('access in background', access_settings)
        self.worker_manager.run_workload(access_settings, self.target_iterator,
                                         timer=access_settings.time)

    def access_bg_with_ddocs(self):
        access_settings = self.test_config.access_settings
        log_phase('access phase', access_settings)
        index_type = self.test_config.index_settings.index_type
        self.worker_manager.run_workload(access_settings, self.target_iterator,
                                         timer=access_settings.time,
                                         ddocs=self.ddocs,
                                         index_type=index_type)

    def timer(self):
        access_settings = self.test_config.access_settings
        logger.info('Running phase for {} seconds'.format(access_settings.time))
        time.sleep(access_settings.time)

    def debug(self):
        self.remote.collect_info()
        for hostname in self.cluster_spec.yield_hostnames():
            for fname in glob.glob('{}/*.zip'.format(hostname)):
                shutil.move(fname, '{}.zip'.format(hostname))
        self.reporter.save_web_logs()
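
Because __exit__ checks rebalance status and failover counters, PerfTest is meant to run under a with block; the phase order below is an illustrative assumption.

with PerfTest(cluster_spec, test_config) as test:
    test.load()                  # populate the buckets
    test.wait_for_persistence()  # drain the disk queues
    test.compact_bucket()
    test.access()                # measured workload phase
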
Example #42
0
 def __init__(self, cluster_spec: ClusterSpec):
     super().__init__(cluster_spec=cluster_spec)
     self.remote = RemoteHelper(cluster_spec)
     self.ip_table, self.port_translation = self.remote.get_ip_port_mapping()
Example #43
0
    def __init__(self, cluster_spec, options):
        self.options = options
        self.cluster_spec = cluster_spec

        self.operator_version = self.options.operator_version
        if "-" in self.operator_version:
            self.operator_release = self.operator_version.split("-")[0]
            self.operator_tag = 'registry.gitlab.com/cb-vanilla/operator:{}'\
                .format(self.operator_version)
            self.admission_controller_release = self.operator_version.split("-")[0]
            self.admission_controller_tag = \
                'registry.gitlab.com/cb-vanilla/admission-controller:{}' \
                .format(self.operator_version)
        else:
            self.operator_release = self.operator_version
            self.operator_tag = 'couchbase/operator:{}'\
                .format(self.operator_version)
            self.admission_controller_release = self.operator_version
            self.admission_controller_tag = 'couchbase/admission-controller:{}' \
                .format(self.operator_version)

        self.couchbase_version = self.options.couchbase_version
        if "-" in self.couchbase_version:
            self.couchbase_release = self.couchbase_version.split("-")[0]
            self.couchbase_tag = 'registry.gitlab.com/cb-vanilla/server:{}'\
                .format(self.couchbase_version)
        else:
            self.couchbase_release = self.couchbase_version
            self.couchbase_tag = 'couchbase/server:{}'\
                .format(self.couchbase_version)

        self.operator_backup_version = self.options.operator_backup_version
        if self.operator_backup_version:
            if "-" in self.operator_backup_version:
                self.operator_backup_release = self.operator_backup_version.split("-")[0]
                self.operator_backup_tag = 'registry.gitlab.com/cb-vanilla/operator-backup:{}'\
                    .format(self.operator_backup_version)
            else:
                self.operator_backup_release = self.operator_backup_version
                self.operator_backup_tag = 'couchbase/operator-backup:{}'\
                    .format(self.operator_backup_version)
        else:
            self.operator_backup_tag = 'registry.gitlab.com/cb-vanilla/operator-backup:latest'

        self.node_count = len(self.cluster_spec.infrastructure_clusters['couchbase1'].split())

        self.remote = RemoteHelper(cluster_spec)

        self.docker_config_path = os.path.expanduser("~") + "/.docker/config.json"
        self.operator_base_path = "cloud/operator/{}/{}"\
            .format(self.operator_release.split(".")[0],
                    self.operator_release.split(".")[1])
        self.certificate_authority_path = "{}/ca.crt"\
            .format(self.operator_base_path)
        self.crd_path = "{}/crd.yaml"\
            .format(self.operator_base_path)
        self.config_path = "{}/config.yaml"\
            .format(self.operator_base_path)
        self.config_template_path = "{}/config_template.yaml"\
            .format(self.operator_base_path)
        self.auth_path = "{}/auth_secret.yaml"\
            .format(self.operator_base_path)
        self.cb_cluster_path = "{}/couchbase-cluster.yaml"\
            .format(self.operator_base_path)
        self.template_cb_cluster_path = "{}/couchbase-cluster_template.yaml"\
            .format(self.operator_base_path)
        self.worker_base_path = "cloud/worker"
        self.worker_path = "{}/worker.yaml"\
            .format(self.worker_base_path)
        self.rmq_base_path = "cloud/broker/rabbitmq/0.48"
        self.rmq_operator_path = "{}/cluster-operator.yaml"\
            .format(self.rmq_base_path)
        self.rmq_cluster_path = "{}/rabbitmq.yaml"\
            .format(self.rmq_base_path)
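
The version-to-tag branching above is repeated for the operator, the admission controller, the server, and the backup images; a hedged sketch of the shared rule (the helper name is hypothetical):

def resolve_tag(version, internal_repo, public_repo):
    # Versions with a build suffix ('2.2.0-123') resolve to the internal
    # registry; bare releases ('2.2.0') resolve to the public Docker Hub repo.
    repo = internal_repo if '-' in version else public_repo
    return '{}:{}'.format(repo, version)

# resolve_tag('2.2.0-123', 'registry.gitlab.com/cb-vanilla/operator',
#             'couchbase/operator')
# -> 'registry.gitlab.com/cb-vanilla/operator:2.2.0-123'
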
Example #44
0
class CouchbaseInstaller(object):

    CBFS = 'http://cbfs-ext.hq.couchbase.com/builds/'
    LATEST_BUILDS = 'http://latestbuilds.hq.couchbase.com/'
    SHERLOCK_BUILDS = ''

    def __init__(self, cluster_spec, options):
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        arch = self.remote.detect_arch()
        pkg = self.remote.detect_pkg()
        release, build = options.version.split('-')
        self.SHERLOCK_BUILDS = 'http://latestbuilds.hq.couchbase.com/couchbase-server/sherlock/{}/'.format(build)

        self.build = Build(arch, pkg, options.version, release, build,
                           options.toy)
        logger.info('Target build info: {}'.format(self.build))

    def get_expected_filenames(self):
        patterns = ()  # Sentinel
        if self.build.toy:
            patterns = (
                'couchbase-server-community_toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_toy-{toy}-{version}-toy_{arch}.{pkg}',
                'couchbase-server-community_cent58-2.5.2-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent58-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent64-3.0.0-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent64-3.0.1-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent58-master-toy-{toy}-{arch}_{version}-toy.{pkg}',
                'couchbase-server-community_cent54-master-toy-{toy}-{arch}_{version}-toy.{pkg}',
            )
        elif self.build.pkg == 'rpm':
            patterns = (
                'couchbase-server-enterprise_centos6_{arch}_{version}-rel.{pkg}',
                'couchbase-server-enterprise-{version}-centos6.{arch}.{pkg}',
                'couchbase-server-enterprise_{arch}_{version}-rel.{pkg}',
            )
        elif self.build.pkg == 'deb':
            patterns = (
                'couchbase-server-enterprise_ubuntu_1204_{arch}_{version}-rel.{pkg}',
                'couchbase-server-enterprise_{version}-ubuntu12.04_amd64.{pkg}',
                'couchbase-server-enterprise_{arch}_{version}-rel.{pkg}',
            )
        elif self.build.pkg == 'exe':
            patterns = (
                'couchbase-server-enterprise_{arch}_{version}-rel.setup.{pkg}',
                'couchbase_server-enterprise-windows-amd64-{version}.{pkg}',
                'couchbase_server/{release}/{build}/couchbase_server-enterprise-windows-amd64-{version}.exe',
            )

        for pattern in patterns:
            yield pattern.format(**self.build._asdict())

    def find_package(self):
        for filename in self.get_expected_filenames():
            for base in (self.LATEST_BUILDS, self.SHERLOCK_BUILDS, self.CBFS):
                url = '{}{}'.format(base, filename)
                try:
                    status_code = requests.head(url).status_code
                except ConnectionError:
                    continue
                else:
                    if status_code == 200:
                        logger.info('Found "{}"'.format(url))
                        return filename, url
        logger.interrupt('Target build not found')

    def kill_processes(self):
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase(self.build.pkg)

    def clean_data(self):
        self.remote.clean_data()

    def install_package(self):
        filename, url = self.find_package()
        self.remote.install_couchbase(self.build.pkg, url, filename,
                                      self.build.release)

    def install(self):
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
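
get_expected_filenames() works because Build is a namedtuple, so _asdict() can feed str.format directly; a self-contained sketch of that pattern, with field names taken from the constructor call above:

from collections import namedtuple

Build = namedtuple('Build', ['arch', 'pkg', 'version', 'release', 'build', 'toy'])

build = Build('x86_64', 'rpm', '4.0.0-4047', '4.0.0', '4047', None)
pattern = 'couchbase-server-enterprise-{version}-centos6.{arch}.{pkg}'
print(pattern.format(**build._asdict()))
# -> couchbase-server-enterprise-4.0.0-4047-centos6.x86_64.rpm
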
Example #45
0
 def __init__(self, cluster_spec, test_config, options):
     self.remote_helper = RemoteHelper(cluster_spec)
     self.cluster_spec = cluster_spec
     self.test_config = test_config
     self.pkg = self.remote_helper.detect_pkg()
     self.version = options.version
Example #46
0
class ClusterManager:
    def __init__(self,
                 cluster_spec: ClusterSpec,
                 test_config: TestConfig,
                 verbose: bool = False):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.dynamic_infra = self.cluster_spec.dynamic_infrastructure
        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.monitor = Monitor(cluster_spec, test_config, verbose)
        self.memcached = MemcachedHelper(test_config)
        self.master_node = next(self.cluster_spec.masters)

        self.initial_nodes = test_config.cluster.initial_nodes
        self.build = self.rest.get_version(self.master_node)

    def is_compatible(self, min_release: str) -> bool:
        for master in self.cluster_spec.masters:
            version = self.rest.get_version(master)
            return version >= min_release

    def set_data_path(self):
        if self.dynamic_infra:
            return
        for server in self.cluster_spec.servers:
            self.remote.change_owner(server, self.cluster_spec.data_path)
            self.rest.set_data_path(server, self.cluster_spec.data_path)

    def set_index_path(self):
        if self.dynamic_infra:
            return
        for server in self.cluster_spec.servers:
            self.remote.change_owner(server, self.cluster_spec.index_path)
            self.rest.set_index_path(server, self.cluster_spec.index_path)

    def set_analytics_path(self):
        if self.dynamic_infra:
            return
        paths = []
        for path in self.cluster_spec.analytics_paths:
            for i in range(self.test_config.analytics_settings.num_io_devices):
                io_device = '{}/dev{}'.format(path, i)
                paths.append(io_device)
        for server in self.cluster_spec.servers_by_role('cbas'):
            for path in self.cluster_spec.analytics_paths:
                self.remote.change_owner(server, path)
            self.rest.set_analytics_paths(server, paths)

    def rename(self):
        if self.dynamic_infra:
            return
        else:
            for server in self.cluster_spec.servers:
                self.rest.rename(server)

    def set_auth(self):
        if self.dynamic_infra:
            return
        else:
            for server in self.cluster_spec.servers:
                self.rest.set_auth(server)

    def set_mem_quotas(self):
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            cluster['spec']['cluster']['dataServiceMemoryQuota'] = \
                '{}Mi'.format(self.test_config.cluster.mem_quota)
            cluster['spec']['cluster']['indexServiceMemoryQuota'] = \
                '{}Mi'.format(self.test_config.cluster.index_mem_quota)
            if self.test_config.cluster.fts_index_mem_quota:
                cluster['spec']['cluster']['searchServiceMemoryQuota'] = \
                    '{}Mi'.format(self.test_config.cluster.fts_index_mem_quota)
            if self.test_config.cluster.analytics_mem_quota:
                cluster['spec']['cluster']['analyticsServiceMemoryQuota'] = \
                    '{}Mi'.format(self.test_config.cluster.analytics_mem_quota)
            if self.test_config.cluster.eventing_mem_quota:
                cluster['spec']['cluster']['eventingServiceMemoryQuota'] = \
                    '{}Mi'.format(self.test_config.cluster.eventing_mem_quota)
            self.remote.update_cluster_config(cluster)
        else:
            for master in self.cluster_spec.masters:
                self.rest.set_mem_quota(master,
                                        self.test_config.cluster.mem_quota)
                self.rest.set_index_mem_quota(
                    master, self.test_config.cluster.index_mem_quota)
                if self.test_config.cluster.fts_index_mem_quota:
                    self.rest.set_fts_index_mem_quota(
                        master, self.test_config.cluster.fts_index_mem_quota)
                if self.test_config.cluster.analytics_mem_quota:
                    self.rest.set_analytics_mem_quota(
                        master, self.test_config.cluster.analytics_mem_quota)
                if self.test_config.cluster.eventing_mem_quota:
                    self.rest.set_eventing_mem_quota(
                        master, self.test_config.cluster.eventing_mem_quota)

    def set_query_settings(self):
        logger.info('Setting query settings')
        if self.dynamic_infra:
            return
        query_nodes = self.cluster_spec.servers_by_role('n1ql')
        if query_nodes:
            settings = self.test_config.n1ql_settings.cbq_settings
            if settings:
                self.rest.set_query_settings(query_nodes[0], settings)
            settings = self.rest.get_query_settings(query_nodes[0])
            settings = pretty_dict(settings)
            logger.info('Query settings: {}'.format(settings))

    def set_index_settings(self):
        logger.info('Setting index settings')
        index_nodes = self.cluster_spec.servers_by_role('index')
        if index_nodes:
            settings = self.test_config.gsi_settings.settings
            if settings:
                if self.dynamic_infra:
                    cluster = self.remote.get_cluster()
                    cluster['spec']['cluster']['indexStorageSetting'] = \
                        settings['indexer.settings.storage_mode']
                    self.remote.update_cluster_config(cluster,
                                                      timeout=300,
                                                      reboot=True)
                    logger.info('Index settings: {}'.format(settings))
                else:
                    self.rest.set_index_settings(index_nodes[0], settings)
                    settings = self.rest.get_index_settings(index_nodes[0])
                    settings = pretty_dict(settings)
                    logger.info('Index settings: {}'.format(settings))

    def set_services(self):
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            server_types = dict()
            server_roles = self.cluster_spec.roles
            for server, role in server_roles.items():
                role = role\
                    .replace('kv', 'data')\
                    .replace('n1ql', 'query')
                server_type_count = server_types.get(role, 0)
                server_types[role] = server_type_count + 1

            istio = 'false'
            if self.cluster_spec.istio_enabled(cluster_name='k8s_cluster_1'):
                istio = 'true'

            cluster_servers = []
            operator_version = self.remote.get_operator_version()
            operator_major = int(operator_version.split(".")[0])
            operator_minor = int(operator_version.split(".")[1])
            for server_role, server_role_count in server_types.items():
                node_selector = {
                    '{}_enabled'.format(
                        service.replace('data', 'kv').replace('query',
                                                              'n1ql')): 'true'
                    for service in server_role.split(",")
                }
                node_selector['NodeRoles'] = 'couchbase1'
                spec = {
                    'imagePullSecrets': [{
                        'name': 'regcred'
                    }],
                    'nodeSelector': node_selector,
                }
                if (operator_major, operator_minor) <= (2, 1):
                    spec['containers'] = []
                pod_def = {
                    'spec': spec,
                    'metadata': {
                        'annotations': {'sidecar.istio.io/inject': istio}
                    },
                }
                server_def = {
                    'name': server_role.replace(",", "-"),
                    'services': server_role.split(","),
                    'pod': pod_def,
                    'size': server_role_count,
                    'volumeMounts': {'default': 'couchbase_kv'},
                }
                cluster_servers.append(server_def)

            cluster['spec']['servers'] = cluster_servers
            self.remote.update_cluster_config(cluster,
                                              timeout=300,
                                              reboot=True)
        else:
            if not self.is_compatible(min_release='4.0.0'):
                return

            for master in self.cluster_spec.masters:
                roles = self.cluster_spec.roles[master]
                self.rest.set_services(master, roles)

    def add_nodes(self):
        if self.dynamic_infra:
            return
        for (_, servers), initial_nodes \
                in zip(self.cluster_spec.clusters, self.initial_nodes):

            if initial_nodes < 2:  # Single-node cluster
                continue

            master = servers[0]
            for node in servers[1:initial_nodes]:
                roles = self.cluster_spec.roles[node]
                self.rest.add_node(master, node, roles)

    def rebalance(self):
        if self.dynamic_infra:
            return
        for (_, servers), initial_nodes \
                in zip(self.cluster_spec.clusters, self.initial_nodes):
            master = servers[0]
            known_nodes = servers[:initial_nodes]
            ejected_nodes = []
            self.rest.rebalance(master, known_nodes, ejected_nodes)
            self.monitor.monitor_rebalance(master)
        self.wait_until_healthy()

    def increase_bucket_limit(self, num_buckets: int):
        if self.dynamic_infra:
            return
        for master in self.cluster_spec.masters:
            self.rest.increase_bucket_limit(master, num_buckets)

    def flush_buckets(self):
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.flush_bucket(host=master, bucket=bucket_name)

    def delete_buckets(self):
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.delete_bucket(host=master, name=bucket_name)

    def create_buckets(self):
        mem_quota = self.test_config.cluster.mem_quota
        if self.test_config.cluster.num_buckets > 7:
            self.increase_bucket_limit(self.test_config.cluster.num_buckets +
                                       3)

        if self.test_config.cluster.eventing_metadata_bucket_mem_quota:
            mem_quota -= (
                self.test_config.cluster.eventing_metadata_bucket_mem_quota +
                self.test_config.cluster.eventing_bucket_mem_quota)
        per_bucket_quota = mem_quota // self.test_config.cluster.num_buckets
        if self.dynamic_infra:
            self.remote.delete_all_buckets()
            for bucket_name in self.test_config.buckets:
                self.remote.create_bucket(bucket_name, per_bucket_quota,
                                          self.test_config.bucket)
        else:
            if self.test_config.bucket.backend_storage == 'magma':
                self.enable_developer_preview()
            for master in self.cluster_spec.masters:
                for bucket_name in self.test_config.buckets:
                    self.rest.create_bucket(
                        host=master,
                        name=bucket_name,
                        ram_quota=per_bucket_quota,
                        password=self.test_config.bucket.password,
                        replica_number=self.test_config.bucket.replica_number,
                        replica_index=self.test_config.bucket.replica_index,
                        eviction_policy=self.test_config.bucket.eviction_policy,
                        bucket_type=self.test_config.bucket.bucket_type,
                        backend_storage=self.test_config.bucket.backend_storage,
                        conflict_resolution_type=self.test_config.bucket.conflict_resolution_type,
                        compression_mode=self.test_config.bucket.compression_mode,
                    )

    def create_collections(self):
        if self.dynamic_infra:
            return
        collection_map = self.test_config.collection.collection_map
        for master in self.cluster_spec.masters:
            if collection_map is not None:
                if self.test_config.collection.use_bulk_api:
                    for bucket in collection_map.keys():
                        create_scopes = []
                        for scope in collection_map[bucket]:
                            scope_collections = []
                            for collection in collection_map[bucket][scope]:
                                scope_collections.append({"name": collection})
                            create_scopes.append({
                                "name": scope,
                                "collections": scope_collections,
                            })
                        self.rest.set_collection_map(master, bucket,
                                                     {"scopes": create_scopes})
                else:
                    for bucket in collection_map.keys():
                        delete_default = True
                        for scope in collection_map[bucket]:
                            if scope == '_default':
                                for collection in collection_map[bucket][
                                        scope]:
                                    if collection == "_default":
                                        delete_default = False
                        if delete_default:
                            self.rest.delete_collection(
                                master, bucket, '_default', '_default')

                    for bucket in collection_map.keys():
                        for scope in collection_map[bucket]:
                            if scope != '_default':
                                self.rest.create_scope(master, bucket, scope)
                            for collection in collection_map[bucket][scope]:
                                if collection != '_default':
                                    self.rest.create_collection(
                                        master, bucket, scope, collection)

    def create_eventing_buckets(self):
        if not self.test_config.cluster.eventing_bucket_mem_quota:
            return
        if self.dynamic_infra:
            return
        per_bucket_quota = \
            self.test_config.cluster.eventing_bucket_mem_quota \
            // self.test_config.cluster.eventing_buckets

        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.eventing_buckets:
                self.rest.create_bucket(
                    host=master,
                    name=bucket_name,
                    ram_quota=per_bucket_quota,
                    password=self.test_config.bucket.password,
                    replica_number=self.test_config.bucket.replica_number,
                    replica_index=self.test_config.bucket.replica_index,
                    eviction_policy=self.test_config.bucket.eviction_policy,
                    bucket_type=self.test_config.bucket.bucket_type,
                    conflict_resolution_type=self.test_config.bucket.conflict_resolution_type,
                )

    def create_eventing_metadata_bucket(self):
        if not self.test_config.cluster.eventing_metadata_bucket_mem_quota:
            return
        if self.dynamic_infra:
            return
        for master in self.cluster_spec.masters:
            self.rest.create_bucket(
                host=master,
                name=self.test_config.cluster.EVENTING_METADATA_BUCKET_NAME,
                ram_quota=self.test_config.cluster.eventing_metadata_bucket_mem_quota,
                password=self.test_config.bucket.password,
                replica_number=self.test_config.bucket.replica_number,
                replica_index=self.test_config.bucket.replica_index,
                eviction_policy=self.test_config.bucket.EVICTION_POLICY,
                bucket_type=self.test_config.bucket.BUCKET_TYPE,
            )

    def configure_auto_compaction(self):
        compaction_settings = self.test_config.compaction
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            db = int(compaction_settings.db_percentage)
            view = int(compaction_settings.view_percentage)
            # bool() of a non-empty string is always True, so compare explicitly.
            para = str(compaction_settings.parallel).lower() == 'true'

            auto_compaction = cluster['spec']['cluster']\
                .get('autoCompaction',
                     {'databaseFragmentationThreshold': {'percent': 30},
                      'viewFragmentationThreshold': {'percent': 30},
                      'parallelCompaction': False})

            db_percent = auto_compaction.get('databaseFragmentationThreshold',
                                             {'percent': 30})
            db_percent['percent'] = db
            auto_compaction['databaseFragmentationThreshold'] = db_percent

            views_percent = auto_compaction.get('viewFragmentationThreshold',
                                                {'percent': 30})
            views_percent['percent'] = view
            auto_compaction['viewFragmentationThreshold'] = views_percent
            auto_compaction['parallelCompaction'] = para

            self.remote.update_cluster_config(cluster)
        else:
            for master in self.cluster_spec.masters:
                self.rest.configure_auto_compaction(master,
                                                    compaction_settings)
                settings = self.rest.get_auto_compaction_settings(master)
                logger.info('Auto-compaction settings: {}'.format(
                    pretty_dict(settings)))

    def configure_internal_settings(self):
        internal_settings = self.test_config.internal_settings
        for master in self.cluster_spec.masters:
            for parameter, value in internal_settings.items():
                if self.dynamic_infra:
                    raise Exception(
                        'not supported for dynamic infrastructure yet')
                else:
                    self.rest.set_internal_settings(
                        master, {parameter: maybe_atoi(value)})

    def configure_xdcr_settings(self):
        xdcr_cluster_settings = self.test_config.xdcr_cluster_settings
        if self.dynamic_infra:
            return
        for master in self.cluster_spec.masters:
            for parameter, value in xdcr_cluster_settings.items():
                self.rest.set_xdcr_cluster_settings(
                    master, {parameter: maybe_atoi(value)})

    def tweak_memory(self):
        if self.dynamic_infra:
            return
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def enable_n2n_encryption(self):
        if self.dynamic_infra:
            return
        if self.test_config.cluster.enable_n2n_encryption:
            for master in self.cluster_spec.masters:
                self.remote.enable_n2n_encryption(
                    master, self.test_config.cluster.enable_n2n_encryption)

    def restart_with_alternative_num_vbuckets(self):
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            if self.dynamic_infra:
                raise Exception('not supported for dynamic infrastructure yet')
            else:
                self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        """Apply custom buckets settings.

        Tune bucket settings (e.g., max_num_shards or max_num_auxio) using
        "/diag/eval" and restart the entire cluster.
        """
        if self.dynamic_infra:
            return
        if self.test_config.bucket_extras:
            self.remote.enable_nonlocal_diag_eval()

        cmd = 'ns_bucket:update_bucket_props("{}", ' \
              '[{{extra_config_string, "{}={}"}}]).'
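
        # Example: cmd.format('bucket-1', 'max_num_shards', 13) renders
        # 'ns_bucket:update_bucket_props("bucket-1",
        #  [{extra_config_string, "max_num_shards=13"}]).'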

        for option, value in self.test_config.bucket_extras.items():
            if re.search("^num_.*_threads$", option):
                self.rest.set_num_threads(self.master_node, option, value)
            else:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.cluster_spec.masters:
                    for bucket in self.test_config.buckets:
                        diag_eval = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, diag_eval)

        if self.test_config.bucket_extras:
            self.disable_auto_failover()
            self.remote.restart()
            self.wait_until_healthy()
            self.enable_auto_failover()

    def tune_logging(self):
        if self.dynamic_infra:
            return
        self.remote.tune_log_rotation()
        self.remote.restart()

    def enable_auto_failover(self):
        enabled = self.test_config.bucket.autofailover_enabled
        failover_min = self.test_config.bucket.failover_min
        failover_max = self.test_config.bucket.failover_max
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            cluster['spec']['cluster']['autoFailoverMaxCount'] = 1
            cluster['spec']['cluster']['autoFailoverServerGroup'] = bool(
                enabled)
            cluster['spec']['cluster']['autoFailoverOnDataDiskIssues'] = bool(
                enabled)
            cluster['spec']['cluster']['autoFailoverOnDataDiskIssuesTimePeriod'] = \
                '{}s'.format(10)
            cluster['spec']['cluster']['autoFailoverTimeout'] = \
                '{}s'.format(failover_max)
            self.remote.update_cluster_config(cluster)
        else:
            for master in self.cluster_spec.masters:
                self.rest.set_auto_failover(master, enabled, failover_min,
                                            failover_max)

    def disable_auto_failover(self):
        enabled = 'false'
        failover_min = self.test_config.bucket.failover_min
        failover_max = self.test_config.bucket.failover_max
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            cluster['spec']['cluster']['autoFailoverMaxCount'] = 1
            # bool('false') is True, so disable explicitly with literal False.
            cluster['spec']['cluster']['autoFailoverServerGroup'] = False
            cluster['spec']['cluster']['autoFailoverOnDataDiskIssues'] = False
            cluster['spec']['cluster']['autoFailoverOnDataDiskIssuesTimePeriod'] = \
                '{}s'.format(10)
            cluster['spec']['cluster']['autoFailoverTimeout'] = \
                '{}s'.format(failover_max)
            self.remote.update_cluster_config(cluster)
        else:
            for master in self.cluster_spec.masters:
                self.rest.set_auto_failover(master, enabled, failover_min,
                                            failover_max)

    def wait_until_warmed_up(self):
        if self.test_config.bucket.bucket_type in ('ephemeral', 'memcached'):
            return
        if self.dynamic_infra:
            self.remote.wait_for_cluster_ready()
        else:
            for master in self.cluster_spec.masters:
                for bucket in self.test_config.buckets:
                    self.monitor.monitor_warmup(self.memcached, master, bucket)

    def wait_until_healthy(self):
        if self.dynamic_infra:
            self.remote.wait_for_cluster_ready()
        else:
            for master in self.cluster_spec.masters:
                self.monitor.monitor_node_health(master)

                for analytics_node in self.rest.get_active_nodes_by_role(
                        master, 'cbas'):
                    self.monitor.monitor_analytics_node_active(analytics_node)

    def gen_disabled_audit_events(self, master: str) -> List[str]:
        curr_settings = self.rest.get_audit_settings(master)
        curr_disabled = {str(event) for event in curr_settings['disabled']}
        disabled = curr_disabled - self.test_config.audit_settings.extra_events
        return list(disabled)

    def enable_audit(self):
        if self.dynamic_infra:
            return
        if not self.is_compatible(min_release='4.0.0') or \
                self.rest.is_community(self.master_node):
            return

        if not self.test_config.audit_settings.enabled:
            return

        for master in self.cluster_spec.masters:
            disabled = []
            if self.test_config.audit_settings.extra_events:
                disabled = self.gen_disabled_audit_events(master)
            self.rest.enable_audit(master, disabled)

    def generate_ce_roles(self) -> List[str]:
        return ['admin']

    def generate_ee_roles(self) -> List[str]:
        existing_roles = {
            r['role']
            for r in self.rest.get_rbac_roles(self.master_node)
        }

        roles = []
        for role in (
                'bucket_admin',
                'data_dcp_reader',
                'data_monitoring',
                'data_reader_writer',
                'data_reader',
                'data_writer',
                'fts_admin',
                'fts_searcher',
                'query_delete',
                'query_insert',
                'query_select',
                'query_update',
                'views_admin',
        ):
            if role in existing_roles:
                roles.append(role + '[{bucket}]')

        return roles

    def delete_rbac_users(self):
        if not self.is_compatible(min_release='5.0'):
            return

        for master in self.cluster_spec.masters:
            for bucket in self.test_config.buckets:
                self.rest.delete_rbac_user(host=master, bucket=bucket)

    def add_rbac_users(self):
        if self.dynamic_infra:
            self.remote.create_from_file(
                "cloud/operator/2/1/user-password-secret.yaml")
            # self.remote.create_from_file("cloud/operator/2/1/admin-user.yaml")
            self.remote.create_from_file("cloud/operator/2/1/bucket-user.yaml")
            self.remote.create_from_file(
                "cloud/operator/2/1/rbac-admin-group.yaml")
            self.remote.create_from_file(
                "cloud/operator/2/1/rbac-admin-role-binding.yaml")
        else:
            if not self.rest.supports_rbac(self.master_node):
                logger.info('RBAC not supported - skipping adding RBAC users')
                return

            if self.rest.is_community(self.master_node):
                roles = self.generate_ce_roles()
            else:
                roles = self.generate_ee_roles()

            for master in self.cluster_spec.masters:
                admin_user, admin_password = self.cluster_spec.rest_credentials
                self.rest.add_rbac_user(
                    host=master,
                    user=admin_user,
                    password=admin_password,
                    roles=['admin'],
                )

                buckets = self.test_config.buckets + self.test_config.eventing_buckets

                for bucket in buckets:
                    bucket_roles = [
                        role.format(bucket=bucket) for role in roles
                    ]
                    bucket_roles.append("admin")
                    self.rest.add_rbac_user(
                        host=master,
                        user=bucket,  # Backward compatibility
                        password=self.test_config.bucket.password,
                        roles=bucket_roles,
                    )

    def add_extra_rbac_users(self, num_users):
        if not self.rest.supports_rbac(self.master_node):
            logger.info('RBAC not supported - skipping adding RBAC users')
            return

        if self.rest.is_community(self.master_node):
            roles = self.generate_ce_roles()
        else:
            roles = self.generate_ee_roles()

        for master in self.cluster_spec.masters:
            admin_user, admin_password = self.cluster_spec.rest_credentials
            self.rest.add_rbac_user(
                host=master,
                user=admin_user,
                password=admin_password,
                roles=['admin'],
            )

            for bucket in self.test_config.buckets:
                bucket_roles = [role.format(bucket=bucket) for role in roles]
                bucket_roles.append("admin")
                for i in range(1, num_users + 1):
                    # The user-name literal was redacted in the source;
                    # 'user{user_number}' is an assumed placeholder.
                    user = 'user{user_number}'.format(user_number=str(i))
                    self.rest.add_rbac_user(
                        host=master,
                        user=user,
                        password=self.test_config.bucket.password,
                        roles=bucket_roles,
                    )

    def throttle_cpu(self):
        if self.dynamic_infra:
            cluster = self.remote.get_cluster()
            if self.test_config.cluster.enable_cpu_cores:
                server_groups = cluster['spec']['servers']
                updated_server_groups = []
                default_cpu = 80
                for server_group in server_groups:
                    resources = server_group.get('resources', {})
                    limits = resources.get('limits', {})
                    limits['cpu'] = default_cpu
                    resources['limits'] = limits
                    server_group['resources'] = resources
                    updated_server_groups.append(server_group)
                cluster['spec']['servers'] = updated_server_groups

            if self.test_config.cluster.online_cores:
                server_groups = cluster['spec']['servers']
                updated_server_groups = []
                online_vcpus = self.test_config.cluster.online_cores * 2
                for server_group in server_groups:
                    resources = server_group.get('resources', {})
                    limits = resources.get('limits', {})
                    limits['cpu'] = online_vcpus
                    resources['limits'] = limits
                    server_group['resources'] = resources
                    updated_server_groups.append(server_group)
                cluster['spec']['servers'] = updated_server_groups
            self.remote.update_cluster_config(cluster,
                                              timeout=300,
                                              reboot=True)
        else:
            if self.remote.os == 'Cygwin':
                return

            if self.test_config.cluster.enable_cpu_cores:
                self.remote.enable_cpu()

            if self.test_config.cluster.online_cores:
                self.remote.disable_cpu(self.test_config.cluster.online_cores)

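    # Shape of one entry in cluster['spec']['servers'] after throttle_cpu()
    # runs with online_cores=8 (group layout and numbers are illustrative):
    #
    #   {'services': ['data'],
    #    'resources': {'limits': {'cpu': 16}}}   # online_cores * 2 vCPUs
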
    def tune_memory_settings(self):
        kernel_memory = self.test_config.cluster.kernel_mem_limit
        if kernel_memory:
            if self.dynamic_infra:
                cluster = self.remote.get_cluster()
                server_groups = cluster['spec']['servers']
                tune_services = set()
                # CAO uses different service names than perfrunner
                for service in self.test_config.cluster.kernel_mem_limit_services:
                    if service == 'kv':
                        service = 'data'
                    elif service == 'n1ql':
                        service = 'query'
                    elif service == 'fts':
                        service = 'search'
                    elif service == 'cbas':
                        service = 'analytics'
                    tune_services.add(service)

                updated_server_groups = []
                default_mem = '128Gi'
                for server_group in server_groups:
                    services_in_group = set(server_group['services'])
                    resources = server_group.get('resources', {})
                    limits = resources.get('limits', {})
                    mem_limit = limits.get('memory', default_mem)
                    if services_in_group.intersection(
                            tune_services) and kernel_memory != 0:
                        mem_limit = '{}Mi'.format(kernel_memory)
                    limits['memory'] = mem_limit
                    resources['limits'] = limits
                    server_group['resources'] = resources
                    updated_server_groups.append(server_group)

                cluster['spec']['servers'] = updated_server_groups
                self.remote.update_cluster_config(cluster,
                                                  timeout=300,
                                                  reboot=True)
            else:
                for service in self.test_config.cluster.kernel_mem_limit_services:
                    for server in self.cluster_spec.servers_by_role(service):
                        self.remote.tune_memory_settings(host_string=server,
                                                         size=kernel_memory)
                self.monitor.wait_for_servers()

    def reset_memory_settings(self):
        if self.dynamic_infra:
            return
        for service in self.test_config.cluster.kernel_mem_limit_services:
            for server in self.cluster_spec.servers_by_role(service):
                self.remote.reset_memory_settings(host_string=server)
        self.monitor.wait_for_servers()

    def flush_iptables(self):
        if self.dynamic_infra:
            return
        self.remote.flush_iptables()

    def clear_login_history(self):
        if self.dynamic_infra:
            return
        self.remote.clear_wtmp()

    def disable_wan(self):
        if self.dynamic_infra:
            return
        self.remote.disable_wan()

    def enable_ipv6(self):
        if self.dynamic_infra:
            return
        if self.test_config.cluster.ipv6:
            version, build_number = self.build.split('-')
            build = tuple(map(int, version.split('.'))) + (int(build_number), )

            if build < (6, 5, 0, 0):
                self.remote.update_ip_family_rest()
            else:
                self.remote.update_ip_family_cli()
            self.remote.enable_ipv6()

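    # Version-gate sketch: a build string such as '6.5.0-4960' (illustrative)
    # splits into version='6.5.0' and build_number='4960', giving the
    # comparable tuple (6, 5, 0, 4960); that is not < (6, 5, 0, 0), so the
    # CLI path is taken above.
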
    def set_x509_certificates(self):
        if self.dynamic_infra:
            return
        logger.info('Setting x509 settings')
        if self.test_config.access_settings.ssl_mode == "auth":
            self.remote.setup_x509()
            for host in self.cluster_spec.servers:
                self.rest.upload_cluster_certificate(host)
            for host in self.cluster_spec.servers:
                self.rest.reload_cluster_certificate(host)
                self.rest.enable_certificate_auth(host)

    def set_cipher_suite(self):
        if self.dynamic_infra:
            return
        if self.test_config.access_settings.cipher_list:
            cipher_suite = self.rest.get_cipher_suite(self.master_node)
            logger.info('current cipher suite: {}'.format(cipher_suite))
            self.rest.set_cipher_suite(
                self.master_node, self.test_config.access_settings.cipher_list)
            cipher_suite = self.rest.get_cipher_suite(self.master_node)
            logger.info('new cipher suite: {}'.format(cipher_suite))

    def set_min_tls_version(self):
        if self.dynamic_infra:
            return
        if self.test_config.access_settings.min_tls_version:
            check_tls_version = self.rest.get_minimum_tls_version(
                self.master_node)
            logger.info('current tls version: {}'.format(check_tls_version))
            self.rest.set_minimum_tls_version(
                self.master_node,
                self.test_config.access_settings.min_tls_version)
            check_tls_version = self.rest.get_minimum_tls_version(
                self.master_node)
            logger.info('new tls version: {}'.format(check_tls_version))

    def get_debug_rpm_url(self):
        release, build_number = self.build.split('-')
        build = tuple(map(int, release.split('.'))) + (int(build_number), )
        if build > (7, 0, 0, 0):
            release = 'cheshire-cat'
        elif build > (6, 5, 0, 0) and build < (7, 0, 0, 0):
            release = 'mad-hatter'
        elif build < (6, 5, 0, 0):
            release = 'alice'
        centos_version = self.remote.detect_centos_release()

        rpm_url = 'http://latestbuilds.service.couchbase.com/builds/' \
                  'latestbuilds/couchbase-server/{}/{}/' \
                  'couchbase-server-enterprise-debuginfo-{}-centos{}.x86_64.rpm' \
                  ''.format(release, build_number, self.build, centos_version)
        return rpm_url

    def install_cb_debug_rpm(self):
        self.remote.install_cb_debug_rpm(url=self.get_debug_rpm_url())

    def enable_developer_preview(self):
        release, build_number = self.build.split('-')
        build = tuple(map(int, release.split('.'))) + (int(build_number), )
        if build > (7, 0, 0, 4698) or build < (0, 0, 0, 9999):
            self.remote.enable_developer_preview()
Example #47
0
class CouchbaseInstaller:
    def __init__(self, cluster_spec, options):
        self.remote = RemoteHelper(cluster_spec, options.verbose)
        self.options = options

    @property
    def url(self) -> str:
        if validators.url(self.options.version):
            return self.options.version
        else:
            return self.find_package(edition=self.options.edition)

    @property
    def release(self) -> str:
        return self.options.version.split('-')[0]

    @property
    def build(self) -> str:
        split = self.options.version.split('-')
        if len(split) > 1:
            return split[1]

    def find_package(self, edition: str) -> str:
        for url in self.url_iterator(edition):
            if self.is_exist(url):
                return url
        logger.interrupt('Target build not found')

    def url_iterator(self, edition: str) -> Iterator[str]:
        os_release = None
        if self.remote.package == 'rpm':
            os_release = self.remote.detect_centos_release()
        elif self.remote.package == 'deb':
            os_release = self.remote.detect_ubuntu_release()

        for pkg_pattern in PKG_PATTERNS[self.remote.package]:
            for loc_pattern in LOCATIONS:
                url = loc_pattern + pkg_pattern
                yield url.format(release=self.release,
                                 build=self.build,
                                 edition=edition,
                                 os=os_release)

    @staticmethod
    def is_exist(url):
        try:
            status_code = requests.head(url).status_code
        except ConnectionError:
            return False
        if status_code == 200:
            return True
        return False

    def download(self):
        """Download and save a copy of the specified package."""
        if self.remote.package == 'rpm':
            logger.info('Saving a local copy of {}'.format(self.url))
            with open('couchbase.rpm', 'wb') as fh:
                resp = requests.get(self.url)
                fh.write(resp.content)
        else:
            logger.interrupt('Unsupported package format')

    def kill_processes(self):
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase()

    def clean_data(self):
        self.remote.clean_data()

    def install_package(self):
        logger.info('Using this URL: {}'.format(self.url))
        self.remote.upload_iss_files(self.release)
        self.remote.install_couchbase(self.url)

    def install(self):
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
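
A minimal sketch of driving the installer above end to end. The attribute
names on the options object mirror what the class reads; the concrete values
and the spec file name are assumptions:

from argparse import Namespace

options = Namespace(version='7.0.0-3000', edition='enterprise', verbose=False)

cluster_spec = ClusterSpec()
cluster_spec.parse('clusters/example.spec')  # illustrative spec file

installer = CouchbaseInstaller(cluster_spec, options)
installer.install()  # kill processes, uninstall, clean data, install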
Example #48
0
class CouchbaseInstaller(object):

    def __init__(self, cluster_spec, options):
        self.options = options
        self.remote = RemoteHelper(cluster_spec, None, options.verbose)
        self.cluster_spec = cluster_spec

        arch = self.remote.detect_arch()
        pkg = self.remote.detect_pkg()

        release = None
        build = None
        if options.version:
            release, build = options.version.split('-')

        self.build = Build(arch, pkg, options.cluster_edition, options.version,
                           release, build, options.url)
        logger.info('Target build info: {}'.format(self.build))

    def get_expected_filenames(self):
        if self.build.pkg == 'rpm':
            os_release = self.remote.detect_os_release()
            patterns = (
                'couchbase-server-{{edition}}-{{version}}-centos{}.{{arch}}.{{pkg}}'.format(os_release),
                'couchbase-server-{edition}_centos6_{arch}_{version}-rel.{pkg}',
            )
        elif self.build.pkg == 'deb':
            patterns = (
                'couchbase-server-{edition}_{arch}_{version}-rel.{pkg}',
                'couchbase-server-{edition}_{version}-ubuntu12.04_{arch}.{pkg}',
            )
        elif self.build.pkg == 'exe':
            patterns = (
                'couchbase-server-{edition}_{arch}_{version}-rel.setup.{pkg}',
                'couchbase-server-{edition}_{version}-windows_{arch}.{pkg}',
                'couchbase_server-{edition}-windows-{arch}-{version}.{pkg}',
            )
        else:
            patterns = ()  # Sentinel

        for pattern in patterns:
            yield pattern.format(**self.build.__dict__)

    @staticmethod
    def is_exist(url):
        try:
            status_code = requests.head(url).status_code
        except ConnectionError:
            return False
        if status_code == 200:
            return True
        return False

    def find_package(self):
        for filename in self.get_expected_filenames():
            for location in LOCATIONS:
                url = '{}{}'.format(location.format(**self.build.__dict__),
                                    filename)
                if self.is_exist(url):
                    return filename, url
        logger.interrupt('Target build not found')

    def kill_processes(self):
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase(self.build.pkg)

    def clean_data(self):
        self.remote.clean_data()

    def install_package(self):
        if self.options.version:
            filename, url = self.find_package()
        else:
            url = self.options.url
            filename = urlparse(url).path.split('/')[-1]

        logger.info('Using this URL: {}'.format(url))
        self.remote.install_couchbase(self.build.pkg, url, filename,
                                      self.build.release)

    def install(self):
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
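
The RPM pattern in get_expected_filenames() above is formatted in two stages:
the OS release is substituted immediately (hence the doubled braces), and the
Build fields fill the rest later. A standalone sketch with illustrative
values:

os_release = '7'
pattern = ('couchbase-server-{{edition}}-{{version}}-centos{}.{{arch}}.{{pkg}}'
           .format(os_release))
# pattern == 'couchbase-server-{edition}-{version}-centos7.{arch}.{pkg}'
filename = pattern.format(edition='enterprise', version='6.0.0-1693',
                          arch='x86_64', pkg='rpm')
# -> couchbase-server-enterprise-6.0.0-1693-centos7.x86_64.rpm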
Example #49
0
class SGPerfTest(PerfTest):

    COLLECTORS = {
        'disk': False,
        'ns_server': False,
        'ns_server_overview': False,
        'active_tasks': False,
        'syncgateway_stats': True
    }

    ALL_HOSTNAMES = True
    LOCAL_DIR = "YCSB"

    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.memcached = MemcachedHelper(test_config)
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
        self.rest = RestHelper(cluster_spec)
        # self.build = os.environ.get('SGBUILD') or "0.0.0-000"
        self.master_node = next(cluster_spec.masters)
        self.build = self.rest.get_sgversion(self.master_node)
        self.metrics = MetricHelper(self)
        self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)
        if self.test_config.test_case.use_workers:
            self.worker_manager = WorkerManager(cluster_spec, test_config,
                                                verbose)
        self.settings = self.test_config.access_settings
        self.settings.syncgateway_settings = self.test_config.syncgateway_settings
        self.profiler = Profiler(cluster_spec, test_config)
        self.cluster = ClusterManager(cluster_spec, test_config)
        self.target_iterator = TargetIterator(cluster_spec, test_config)
        self.monitor = Monitor(cluster_spec, test_config, verbose)

    def download_ycsb(self):
        if self.worker_manager.is_remote:
            self.remote.clone_ycsb(
                repo=self.test_config.syncgateway_settings.repo,
                branch=self.test_config.syncgateway_settings.branch,
                worker_home=self.worker_manager.WORKER_HOME,
                ycsb_instances=int(self.test_config.syncgateway_settings.
                                   instances_per_client))
        else:
            local.clone_ycsb(
                repo=self.test_config.syncgateway_settings.repo,
                branch=self.test_config.syncgateway_settings.branch)

    def collect_execution_logs(self):
        if self.worker_manager.is_remote:
            if os.path.exists(self.LOCAL_DIR):
                shutil.rmtree(self.LOCAL_DIR, ignore_errors=True)
            os.makedirs(self.LOCAL_DIR)
            self.remote.get_syncgateway_YCSB_logs(
                self.worker_manager.WORKER_HOME,
                self.test_config.syncgateway_settings, self.LOCAL_DIR)

    def run_sg_phase(self,
                     phase: str,
                     task: Callable,
                     settings: PhaseSettings,
                     timer: int = None,
                     distribute: bool = False) -> None:
        logger.info('Running {}: {}'.format(phase, pretty_dict(settings)))
        self.worker_manager.run_sg_tasks(task, settings, timer, distribute,
                                         phase)
        self.worker_manager.wait_for_workers()

    def start_memcached(self):
        self.run_sg_phase("start memcached", syncgateway_task_start_memcached,
                          self.settings, self.settings.time, False)

    def load_users(self):
        self.run_sg_phase("load users", syncgateway_task_load_users,
                          self.settings, self.settings.time, False)

    def init_users(self):
        if self.test_config.syncgateway_settings.auth == 'true':
            self.run_sg_phase("init users", syncgateway_task_init_users,
                              self.settings, self.settings.time, False)

    def grant_access(self):
        if self.test_config.syncgateway_settings.grant_access == 'true':
            self.run_sg_phase("grant access to  users",
                              syncgateway_task_grant_access, self.settings,
                              self.settings.time, False)

    def load_docs(self):
        self.run_sg_phase("load docs", syncgateway_task_load_docs,
                          self.settings, self.settings.time, False)

    @with_stats
    @with_profiles
    def run_test(self):
        self.run_sg_phase("run test", syncgateway_task_run_test, self.settings,
                          self.settings.time, True)

    def compress_sg_logs(self):
        self.remote.compress_sg_logs()

    def get_sg_logs(self):
        initial_nodes = int(self.test_config.syncgateway_settings.nodes)
        ssh_user, ssh_pass = self.cluster_spec.ssh_credentials
        for _server in range(initial_nodes):
            server = self.cluster_spec.servers[_server]
            local.get_sg_logs(host=server,
                              ssh_user=ssh_user,
                              ssh_pass=ssh_pass)

    def run(self):
        self.download_ycsb()
        self.start_memcached()
        self.load_users()
        self.load_docs()
        self.init_users()
        self.grant_access()
        self.run_test()
        self.report_kpi()

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.test_config.test_case.use_workers:
            self.worker_manager.download_celery_logs()
            self.worker_manager.terminate()

        if self.test_config.cluster.online_cores:
            self.remote.enable_cpu()

        if self.test_config.cluster.kernel_mem_limit:
            self.remote.reset_memory_settings()
            self.monitor.wait_for_servers()
Example #50
0
class ClusterManager:
    def __init__(self,
                 cluster_spec: ClusterSpec,
                 test_config: TestConfig,
                 verbose: bool = False):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.monitor = Monitor(cluster_spec, test_config, verbose)
        self.memcached = MemcachedHelper(test_config)

        self.master_node = next(self.cluster_spec.masters)

        self.initial_nodes = test_config.cluster.initial_nodes

    def is_compatible(self, min_release: str) -> bool:
        # Check every cluster master instead of returning after the first one
        for master in self.cluster_spec.masters:
            version = self.rest.get_version(master)
            if version < min_release:
                return False
        return True

    def set_data_path(self):
        for server in self.cluster_spec.servers:
            self.remote.change_owner(server, self.cluster_spec.data_path)
            self.rest.set_data_path(server, self.cluster_spec.data_path)

    def set_index_path(self):
        for server in self.cluster_spec.servers:
            self.remote.change_owner(server, self.cluster_spec.index_path)
            self.rest.set_index_path(server, self.cluster_spec.index_path)

    def set_analytics_path(self):
        paths = []
        for path in self.cluster_spec.analytics_paths:
            for i in range(self.test_config.analytics_settings.num_io_devices):
                io_device = '{}/dev{}'.format(path, i)
                paths.append(io_device)
        for server in self.cluster_spec.servers_by_role('cbas'):
            for path in self.cluster_spec.analytics_paths:
                self.remote.change_owner(server, path)
            self.rest.set_analytics_paths(server, paths)

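    # Worked example of the expansion above, assuming two analytics paths and
    # num_io_devices=2:
    #
    #   ['/data1', '/data2'] -> ['/data1/dev0', '/data1/dev1',
    #                            '/data2/dev0', '/data2/dev1']
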
    def rename(self):
        for server in self.cluster_spec.servers:
            self.rest.rename(server)

    def set_auth(self):
        for server in self.cluster_spec.servers:
            self.rest.set_auth(server)

    def set_mem_quotas(self):
        for master in self.cluster_spec.masters:
            self.rest.set_mem_quota(master, self.test_config.cluster.mem_quota)
            self.rest.set_index_mem_quota(
                master, self.test_config.cluster.index_mem_quota)
            if self.test_config.cluster.fts_index_mem_quota:
                self.rest.set_fts_index_mem_quota(
                    master, self.test_config.cluster.fts_index_mem_quota)
            if self.test_config.cluster.analytics_mem_quota:
                self.rest.set_analytics_mem_quota(
                    master, self.test_config.cluster.analytics_mem_quota)
            if self.test_config.cluster.eventing_mem_quota:
                self.rest.set_eventing_mem_quota(
                    master, self.test_config.cluster.eventing_mem_quota)

    def set_query_settings(self):
        logger.info('Setting query settings')
        query_nodes = self.cluster_spec.servers_by_role('n1ql')
        if query_nodes:
            settings = self.test_config.n1ql_settings.cbq_settings
            if settings:
                self.rest.set_query_settings(query_nodes[0], settings)
            settings = self.rest.get_query_settings(query_nodes[0])
            settings = pretty_dict(settings)
            logger.info('Query settings: {}'.format(settings))

    def set_index_settings(self):
        logger.info('Setting index settings')
        index_nodes = self.cluster_spec.servers_by_role('index')
        if index_nodes:
            settings = self.test_config.gsi_settings.settings
            if settings:
                self.rest.set_index_settings(index_nodes[0], settings)

            settings = self.rest.get_index_settings(index_nodes[0])
            settings = pretty_dict(settings)
            logger.info('Index settings: {}'.format(settings))

    def set_services(self):
        if not self.is_compatible(min_release='4.0.0'):
            return

        for master in self.cluster_spec.masters:
            roles = self.cluster_spec.roles[master]
            self.rest.set_services(master, roles)

    def add_nodes(self):
        for (_, servers), initial_nodes in zip(self.cluster_spec.clusters,
                                               self.initial_nodes):

            if initial_nodes < 2:  # Single-node cluster
                continue

            master = servers[0]
            for node in servers[1:initial_nodes]:
                roles = self.cluster_spec.roles[node]
                self.rest.add_node(master, node, roles)

    def rebalance(self):
        for (_, servers), initial_nodes in zip(self.cluster_spec.clusters,
                                               self.initial_nodes):
            master = servers[0]
            known_nodes = servers[:initial_nodes]
            ejected_nodes = []
            self.rest.rebalance(master, known_nodes, ejected_nodes)
            self.monitor.monitor_rebalance(master)
        self.wait_until_healthy()

    def increase_bucket_limit(self, num_buckets: int):
        for master in self.cluster_spec.masters:
            self.rest.increase_bucket_limit(master, num_buckets)

    def flush_buckets(self):
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.flush_bucket(host=master, bucket=bucket_name)

    def delete_buckets(self):
        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.delete_bucket(host=master, name=bucket_name)

    def create_buckets(self):
        mem_quota = self.test_config.cluster.mem_quota
        if self.test_config.cluster.num_buckets > 7:
            self.increase_bucket_limit(self.test_config.cluster.num_buckets + 3)

        if self.test_config.cluster.eventing_metadata_bucket_mem_quota:
            mem_quota -= (
                self.test_config.cluster.eventing_metadata_bucket_mem_quota +
                self.test_config.cluster.eventing_bucket_mem_quota)
        per_bucket_quota = mem_quota // self.test_config.cluster.num_buckets

        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.buckets:
                self.rest.create_bucket(
                    host=master,
                    name=bucket_name,
                    ram_quota=per_bucket_quota,
                    password=self.test_config.bucket.password,
                    replica_number=self.test_config.bucket.replica_number,
                    replica_index=self.test_config.bucket.replica_index,
                    eviction_policy=self.test_config.bucket.eviction_policy,
                    bucket_type=self.test_config.bucket.bucket_type,
                    conflict_resolution_type=self.test_config.bucket.
                    conflict_resolution_type,
                    compression_mode=self.test_config.bucket.compression_mode,
                )

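    # Quota arithmetic above, with assumed numbers (MB): a 51200 cluster
    # quota minus 10240 (eventing) and 1024 (eventing metadata) leaves 39936,
    # so four buckets get 39936 // 4 == 9984 each.
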
    def create_eventing_buckets(self):
        if not self.test_config.cluster.eventing_bucket_mem_quota:
            return

        per_bucket_quota = \
            self.test_config.cluster.eventing_bucket_mem_quota \
            // self.test_config.cluster.eventing_buckets

        for master in self.cluster_spec.masters:
            for bucket_name in self.test_config.eventing_buckets:
                self.rest.create_bucket(
                    host=master,
                    name=bucket_name,
                    ram_quota=per_bucket_quota,
                    password=self.test_config.bucket.password,
                    replica_number=self.test_config.bucket.replica_number,
                    replica_index=self.test_config.bucket.replica_index,
                    eviction_policy=self.test_config.bucket.eviction_policy,
                    bucket_type=self.test_config.bucket.bucket_type,
                    conflict_resolution_type=self.test_config.bucket.
                    conflict_resolution_type,
                )

    def create_eventing_metadata_bucket(self):
        if not self.test_config.cluster.eventing_metadata_bucket_mem_quota:
            return

        for master in self.cluster_spec.masters:
            self.rest.create_bucket(
                host=master,
                name=self.test_config.cluster.EVENTING_METADATA_BUCKET_NAME,
                ram_quota=self.test_config.cluster.
                eventing_metadata_bucket_mem_quota,
                password=self.test_config.bucket.password,
                replica_number=self.test_config.bucket.replica_number,
                replica_index=self.test_config.bucket.replica_index,
                eviction_policy=self.test_config.bucket.EVICTION_POLICY,
                bucket_type=self.test_config.bucket.BUCKET_TYPE,
            )

    def configure_auto_compaction(self):
        compaction_settings = self.test_config.compaction
        for master in self.cluster_spec.masters:
            self.rest.configure_auto_compaction(master, compaction_settings)
            settings = self.rest.get_auto_compaction_settings(master)
            logger.info('Auto-compaction settings: {}'.format(
                pretty_dict(settings)))

    def configure_internal_settings(self):
        internal_settings = self.test_config.internal_settings
        for master in self.cluster_spec.masters:
            for parameter, value in internal_settings.items():
                self.rest.set_internal_settings(master,
                                                {parameter: maybe_atoi(value)})

    def configure_xdcr_settings(self):
        xdcr_cluster_settings = self.test_config.xdcr_cluster_settings
        for master in self.cluster_spec.masters:
            for parameter, value in xdcr_cluster_settings.items():
                self.rest.set_xdcr_cluster_settings(
                    master, {parameter: maybe_atoi(value)})

    def tweak_memory(self):
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def enable_n2n_encryption(self):
        if self.test_config.cluster.enable_n2n_encryption:
            for master in self.cluster_spec.masters:
                self.remote.enable_n2n_encryption(
                    master, self.test_config.cluster.enable_n2n_encryption)

    def restart_with_alternative_num_vbuckets(self):
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        """Apply custom buckets settings.

        Tune bucket settings (e.g., max_num_shards or max_num_auxio) using
        "/diag/eval" and restart the entire cluster.
        """
        if self.test_config.bucket_extras:
            self.remote.enable_nonlocal_diag_eval()

        cmd = 'ns_bucket:update_bucket_props("{}", ' \
              '[{{extra_config_string, "{}={}"}}]).'

        for option, value in self.test_config.bucket_extras.items():
            if option == 'num_writer_threads':
                self.rest.set_num_writer_threads(self.master_node, int(value))
            elif option == 'num_reader_threads':
                self.rest.set_num_reader_threads(self.master_node, int(value))
            else:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.cluster_spec.masters:
                    for bucket in self.test_config.buckets:
                        diag_eval = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, diag_eval)

        if self.test_config.bucket_extras:
            self.remote.restart()
            self.wait_until_healthy()

    def tune_logging(self):
        self.remote.tune_log_rotation()
        self.remote.restart()

    def enable_auto_failover(self):
        enabled = self.test_config.bucket.autofailover_enabled
        failover_min = self.test_config.bucket.failover_min
        failover_max = self.test_config.bucket.failover_max
        for master in self.cluster_spec.masters:
            self.rest.set_auto_failover(master, enabled, failover_min,
                                        failover_max)

    def wait_until_warmed_up(self):
        if self.test_config.bucket.bucket_type in ('ephemeral', 'memcached'):
            return

        for master in self.cluster_spec.masters:
            for bucket in self.test_config.buckets:
                self.monitor.monitor_warmup(self.memcached, master, bucket)

    def wait_until_healthy(self):
        for master in self.cluster_spec.masters:
            self.monitor.monitor_node_health(master)

            for analytics_node in self.rest.get_active_nodes_by_role(
                    master, 'cbas'):
                self.monitor.monitor_analytics_node_active(analytics_node)

    def gen_disabled_audit_events(self, master: str) -> List[str]:
        curr_settings = self.rest.get_audit_settings(master)
        curr_disabled = {str(event) for event in curr_settings['disabled']}
        disabled = curr_disabled - self.test_config.audit_settings.extra_events
        return list(disabled)

    def enable_audit(self):
        if not self.is_compatible(min_release='4.0.0') or \
                self.rest.is_community(self.master_node):
            return

        if not self.test_config.audit_settings.enabled:
            return

        for master in self.cluster_spec.masters:
            disabled = []
            if self.test_config.audit_settings.extra_events:
                disabled = self.gen_disabled_audit_events(master)
            self.rest.enable_audit(master, disabled)

    def generate_ce_roles(self) -> List[str]:
        return ['admin']

    def generate_ee_roles(self) -> List[str]:
        existing_roles = {
            r['role']
            for r in self.rest.get_rbac_roles(self.master_node)
        }

        roles = []
        for role in (
                'bucket_admin',
                'data_dcp_reader',
                'data_monitoring',
                'data_reader_writer',
                'data_reader',
                'data_writer',
                'fts_admin',
                'fts_searcher',
                'query_delete',
                'query_insert',
                'query_select',
                'query_update',
                'views_admin',
        ):
            if role in existing_roles:
                roles.append(role + '[{bucket}]')

        return roles

    def delete_rbac_users(self):
        if not self.is_compatible(min_release='5.0'):
            return

        for master in self.cluster_spec.masters:
            for bucket in self.test_config.buckets:
                self.rest.delete_rbac_user(host=master, bucket=bucket)

    def add_rbac_users(self):
        if not self.rest.supports_rbac(self.master_node):
            logger.info('RBAC not supported - skipping adding RBAC users')
            return

        if self.rest.is_community(self.master_node):
            roles = self.generate_ce_roles()
        else:
            roles = self.generate_ee_roles()

        for master in self.cluster_spec.masters:
            admin_user, admin_password = self.cluster_spec.rest_credentials
            self.rest.add_rbac_user(
                host=master,
                user=admin_user,
                password=admin_password,
                roles=['admin'],
            )

            for bucket in self.test_config.buckets:
                bucket_roles = [role.format(bucket=bucket) for role in roles]
                bucket_roles.append("admin")
                self.rest.add_rbac_user(
                    host=master,
                    user=bucket,  # Backward compatibility
                    password=self.test_config.bucket.password,
                    roles=bucket_roles,
                )

    def throttle_cpu(self):
        if self.remote.os == 'Cygwin':
            return

        self.remote.enable_cpu()

        if self.test_config.cluster.online_cores:
            self.remote.disable_cpu(self.test_config.cluster.online_cores)

    def tune_memory_settings(self):
        kernel_memory = self.test_config.cluster.kernel_mem_limit
        if kernel_memory:
            for service in self.test_config.cluster.kernel_mem_limit_services:
                for server in self.cluster_spec.servers_by_role(service):
                    self.remote.tune_memory_settings(host_string=server,
                                                     size=kernel_memory)
            self.monitor.wait_for_servers()

    def reset_memory_settings(self):
        for service in self.test_config.cluster.kernel_mem_limit_services:
            for server in self.cluster_spec.servers_by_role(service):
                self.remote.reset_memory_settings(host_string=server)
        self.monitor.wait_for_servers()

    def flush_iptables(self):
        self.remote.flush_iptables()

    def clear_login_history(self):
        self.remote.clear_wtmp()

    def disable_wan(self):
        self.remote.disable_wan()

    def enable_ipv6(self):
        if self.test_config.cluster.ipv6:
            self.remote.enable_ipv6()

    def set_x509_certificates(self):
        logger.info('Setting x509 settings')
        if self.test_config.access_settings.ssl_mode == "auth":
            self.remote.setup_x509()
            for host in self.cluster_spec.servers:
                self.rest.upload_cluster_certificate(host)
            for host in self.cluster_spec.servers:
                self.rest.reload_cluster_certificate(host)
                self.rest.enable_certificate_auth(host)

    def set_cipher_suite(self):
        if self.test_config.access_settings.cipher_list:
            cipher_suite = self.rest.get_cipher_suite(self.master_node)
            logger.info('current cipher suite: {}'.format(cipher_suite))
            self.rest.set_cipher_suite(
                self.master_node, self.test_config.access_settings.cipher_list)
            cipher_suite = self.rest.get_cipher_suite(self.master_node)
            logger.info('new cipher suite: {}'.format(cipher_suite))

    def set_min_tls_version(self):
        if self.test_config.access_settings.min_tls_version:
            check_tls_version = self.rest.get_minimum_tls_version(
                self.master_node)
            logger.info('current tls version: {}'.format(check_tls_version))
            self.rest.set_minimum_tls_version(
                self.master_node,
                self.test_config.access_settings.min_tls_version)
            check_tls_version = self.rest.get_minimum_tls_version(
                self.master_node)
            logger.info('new tls version: {}'.format(check_tls_version))
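
A sketch of how add_nodes() and rebalance() above slice each cluster, with an
assumed four-node cluster and initial_nodes=3:

clusters = [('cluster-1', ['10.0.0.1', '10.0.0.2', '10.0.0.3', '10.0.0.4'])]
initial_nodes = [3]

for (_, servers), n in zip(clusters, initial_nodes):
    master = servers[0]          # '10.0.0.1'
    to_add = servers[1:n]        # ['10.0.0.2', '10.0.0.3'] joined via REST
    known_nodes = servers[:n]    # all three take part in the rebalance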
Example #51
0
class CouchbaseInstaller:

    def __init__(self, cluster_spec, options):
        self.remote = RemoteHelper(cluster_spec, options.verbose)
        self.options = options
        self.cluster_spec = cluster_spec

    @property
    def url(self) -> str:
        if validators.url(self.options.couchbase_version):
            return self.options.couchbase_version
        else:
            return self.find_package(edition=self.options.edition)

    @property
    def release(self) -> str:
        return self.options.couchbase_version.split('-')[0]

    @property
    def build(self) -> str:
        split = self.options.couchbase_version.split('-')
        if len(split) > 1:
            return split[1]

    def find_package(self, edition: str,
                     package: str = None, os_release: str = None) -> str:
        for url in self.url_iterator(edition, package, os_release):
            if self.is_exist(url):
                return url
        logger.interrupt('Target build not found')

    def url_iterator(self, edition: str,
                     package: str = None, os_release: str = None) -> Iterator[str]:
        if package is None:
            if self.remote.package == 'rpm':
                if self.cluster_spec.cloud_infrastructure:
                    os_arch = self.cluster_spec.infrastructure_settings.get('os_arch', 'x86_64')
                    if os_arch == 'arm':
                        os_release = 'amzn2.aarch64'
                    elif os_arch == 'al2':
                        os_release = 'amzn2.x86_64'
                    else:
                        os_release = self.remote.detect_centos_release()
                else:
                    os_release = self.remote.detect_centos_release()
            elif self.remote.package == 'deb':
                os_release = self.remote.detect_ubuntu_release()
            package = self.remote.package

        for pkg_pattern in PKG_PATTERNS[package]:
            for loc_pattern in LOCATIONS:
                url = loc_pattern + pkg_pattern
                yield url.format(release=self.release, build=self.build,
                                 edition=edition, os=os_release)

    @staticmethod
    def is_exist(url):
        try:
            status_code = requests.head(url).status_code
        except ConnectionError:
            return False
        if status_code == 200:
            return True
        return False

    def download(self):
        """Download and save a copy of the specified package."""
        if self.remote.package == 'rpm':
            logger.info('Saving a local copy of {}'.format(self.url))
            with open('couchbase.rpm', 'wb') as fh:
                resp = requests.get(self.url)
                fh.write(resp.content)
        else:
            logger.interrupt('Unsupported package format')

    def download_local(self, local_copy_url: str = None):
        """Download and save a copy of the specified package."""
        try:
            if RemoteHelper.detect_server_os("127.0.0.1", self.cluster_spec).\
                    upper() in ('UBUNTU', 'DEBIAN'):
                os_release = detect_ubuntu_release()
                if local_copy_url:
                    url = local_copy_url
                else:
                    url = self.find_package(edition=self.options.edition,
                                            package="deb", os_release=os_release)
                logger.info('Saving a local copy of {}'.format(url))
                with open('couchbase.deb', 'wb') as fh:
                    resp = requests.get(url)
                    fh.write(resp.content)
        except Exception:
            logger.info('Saving a local copy for Ubuntu failed; '
                        'the package may not be present')

    def download_remote(self):
        """Download and save a copy of the specified package on a remote client."""
        if self.remote.package == 'rpm':
            logger.info('Saving a remote copy of {}'.format(self.url))
            self.wget(url=self.url)
        else:
            logger.interrupt('Unsupported package format')

    @master_client
    def wget(self, url):
        logger.info('Fetching {}'.format(url))
        with cd('/tmp'):
            run('wget -nc "{}"'.format(url))
            package = url.split('/')[-1]
            run('mv {} couchbase.rpm'.format(package))

    def kill_processes(self):
        self.remote.kill_processes()

    def uninstall_package(self):
        self.remote.uninstall_couchbase()

    def clean_data(self):
        self.remote.clean_data()

    def install_package(self):
        logger.info('Using this URL: {}'.format(self.url))
        self.remote.upload_iss_files(self.release)
        self.remote.install_couchbase(self.url)

    def install(self):
        self.kill_processes()
        self.uninstall_package()
        self.clean_data()
        self.install_package()
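
The URL probe in find_package() above walks the cross product of package and
location patterns. PKG_PATTERNS and LOCATIONS are not shown in this listing,
so the stand-ins below are illustrative only:

PKG_PATTERNS = {'rpm': (
    'couchbase-server-{edition}-{release}-{build}-centos{os}.x86_64.rpm',
)}
LOCATIONS = ('http://latestbuilds.example.com/{release}/{build}/',)

for pkg_pattern in PKG_PATTERNS['rpm']:
    for loc_pattern in LOCATIONS:
        url = (loc_pattern + pkg_pattern).format(
            release='7.0.0', build='3000', edition='enterprise', os='7')
        # -> http://latestbuilds.example.com/7.0.0/3000/
        #    couchbase-server-enterprise-7.0.0-3000-centos7.x86_64.rpm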
Example #52
0
    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
Example #53
0
class OperatorInstaller:

    def __init__(self, cluster_spec, options):
        self.options = options
        self.cluster_spec = cluster_spec

        self.operator_version = self.options.operator_version
        if "-" in self.operator_version:
            self.operator_release = self.operator_version.split("-")[0]
            self.operator_tag = 'registry.gitlab.com/cb-vanilla/operator:{}'\
                .format(self.operator_version)
            self.admission_controller_release = self.operator_version.split("-")[0]
            self.admission_controller_tag = \
                'registry.gitlab.com/cb-vanilla/admission-controller:{}' \
                .format(self.operator_version)
        else:
            self.operator_release = self.operator_version
            self.operator_tag = 'couchbase/operator:{}'\
                .format(self.operator_version)
            self.admission_controller_release = self.operator_version
            self.admission_controller_tag = 'couchbase/admission-controller:{}' \
                .format(self.operator_version)

        self.couchbase_version = self.options.couchbase_version
        if "-" in self.couchbase_version:
            self.couchbase_release = self.couchbase_version.split("-")[0]
            self.couchbase_tag = 'registry.gitlab.com/cb-vanilla/server:{}'\
                .format(self.couchbase_version)
        else:
            self.couchbase_release = self.couchbase_version
            self.couchbase_tag = 'couchbase/server:{}'\
                .format(self.couchbase_version)

        self.operator_backup_version = self.options.operator_backup_version
        if self.operator_backup_version:
            if "-" in self.operator_backup_version:
                self.operator_backup_release = self.operator_backup_version.split("-")[0]
                self.operator_backup_tag = 'registry.gitlab.com/cb-vanilla/operator-backup:{}'\
                    .format(self.operator_backup_version)
            else:
                self.operator_backup_release = self.operator_backup_version
                self.operator_backup_tag = 'couchbase/operator-backup:{}'\
                    .format(self.operator_backup_version)
        else:
            self.operator_backup_tag = 'registry.gitlab.com/cb-vanilla/operator-backup:latest'

        self.node_count = len(self.cluster_spec.infrastructure_clusters['couchbase1'].split())

        self.remote = RemoteHelper(cluster_spec)

        self.docker_config_path = os.path.expanduser("~") + "/.docker/config.json"
        self.operator_base_path = "cloud/operator/{}/{}"\
            .format(self.operator_release.split(".")[0],
                    self.operator_release.split(".")[1])
        self.certificate_authority_path = "{}/ca.crt"\
            .format(self.operator_base_path)
        self.crd_path = "{}/crd.yaml"\
            .format(self.operator_base_path)
        self.config_path = "{}/config.yaml"\
            .format(self.operator_base_path)
        self.config_template_path = "{}/config_template.yaml"\
            .format(self.operator_base_path)
        self.auth_path = "{}/auth_secret.yaml"\
            .format(self.operator_base_path)
        self.cb_cluster_path = "{}/couchbase-cluster.yaml"\
            .format(self.operator_base_path)
        self.template_cb_cluster_path = "{}/couchbase-cluster_template.yaml"\
            .format(self.operator_base_path)
        self.worker_base_path = "cloud/worker"
        self.worker_path = "{}/worker.yaml"\
            .format(self.worker_base_path)
        self.rmq_base_path = "cloud/broker/rabbitmq/0.48"
        self.rmq_operator_path = "{}/cluster-operator.yaml"\
            .format(self.rmq_base_path)
        self.rmq_cluster_path = "{}/rabbitmq.yaml"\
            .format(self.rmq_base_path)

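    # Path derivation sketch: an operator_version of '2.2.1-123'
    # (illustrative) yields operator_release '2.2.1', hence operator_base_path
    # 'cloud/operator/2/2' and crd_path 'cloud/operator/2/2/crd.yaml'.
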
    def install(self):
        self.install_operator()
        self.install_celery_broker()

    def install_operator(self):
        logger.info("installing operator")
        self.create_secrets()
        self.create_crd()
        self.create_config()
        self.wait_for_operator_and_admission()
        self.create_auth()
        self.create_cluster()
        self.wait_for_cluster()

    def install_celery_broker(self):
        logger.info("installing celery broker")
        self.create_rabbitmq_operator()
        self.wait_for_rabbitmq_operator()
        self.create_rabbitmq_cluster()
        self.wait_for_rabbitmq_cluster()
        self.create_rabbitmq_config()

    def uninstall(self):
        self.uninstall_operator()
        self.uninstall_celery_broker()
        self.uninstall_workers()
        self.delete_artifacts()

    def uninstall_operator(self):
        logger.info("uninstalling operator")
        self.delete_operator_files()
        self.delete_operator_secrets()
        self.wait_for_operator_deletion()

    def uninstall_celery_broker(self):
        logger.info("uninstalling celery broker")
        self.delete_rabbitmq_files()
        self.wait_for_rabbitmq_deletion()

    def uninstall_workers(self):
        logger.info("uninstall workers")
        self.delete_worker_files()
        self.wait_for_worker_deletion()

    def create_secrets(self):
        logger.info("creating secrets")
        self.remote.create_docker_secret(
            self.docker_config_path)
        self.remote.create_operator_tls_secret(
            self.certificate_authority_path)

    def create_crd(self):
        logger.info("creating CRD")
        self.remote.create_from_file(self.crd_path)

    def create_config(self):
        logger.info("creating config")
        self.remote.create_operator_config(
            self.config_template_path,
            self.config_path,
            self.operator_tag,
            self.admission_controller_tag)

    def create_auth(self):
        logger.info("creating auth")
        self.remote.create_from_file(self.auth_path)

    def create_cluster(self):
        logger.info("creating couchbase cluster")
        self.remote.create_couchbase_cluster(
            self.template_cb_cluster_path,
            self.cb_cluster_path,
            self.couchbase_tag,
            self.operator_backup_tag,
            self.node_count)

    def wait_for_operator_and_admission(self):
        logger.info("waiting for operator and admission controller")
        self.remote.wait_for_admission_controller_ready()
        self.remote.wait_for_operator_ready()

    def wait_for_cluster(self):
        logger.info("waiting for cluster")
        self.remote.wait_for_couchbase_pods_ready(self.node_count)

    def create_rabbitmq_operator(self):
        logger.info("creating rabbitmq operator")
        self.remote.create_from_file(self.rmq_operator_path)

    def wait_for_rabbitmq_operator(self):
        logger.info("waiting for rabbitmq operator")
        self.remote.wait_for_rabbitmq_operator_ready()

    def create_rabbitmq_cluster(self):
        logger.info("creating rabbitmq cluster")
        self.remote.create_from_file(self.rmq_cluster_path)

    def wait_for_rabbitmq_cluster(self):
        logger.info("waiting for rabbitmq cluster")
        self.remote.wait_for_rabbitmq_broker_ready()

    def create_rabbitmq_config(self):
        logger.info("creating rabbitmq config")
        self.remote.upload_rabbitmq_config()

    def delete_operator_files(self):
        logger.info("deleting operator files")
        files = [self.cb_cluster_path, self.auth_path,
                 self.config_path, self.crd_path]
        self.remote.delete_from_files(files)

    def delete_operator_secrets(self):
        logger.info("deleting operator secrets")
        secrets = ['regcred', 'couchbase-operator-tls',
                   'couchbase-server-tls', 'user-password-secret']
        self.remote.delete_secrets(secrets)

    def wait_for_operator_deletion(self):
        logger.info("waiting for operator deletion")
        self.remote.wait_for_operator_deletion()

    def delete_rabbitmq_files(self):
        logger.info("deleting rabbit mq files")
        self.remote.delete_from_files(
            [self.rmq_cluster_path,
             self.rmq_operator_path])

    def wait_for_rabbitmq_deletion(self):
        logger.info("waiting for rabbitmq deletion")
        self.remote.wait_for_rabbitmq_deletion()

    def delete_worker_files(self):
        logger.info("deleting worker files")
        self.remote.delete_from_file(self.worker_path)

    def wait_for_worker_deletion(self):
        logger.info("waiting for worker deletion")
        self.remote.wait_for_workers_deletion()

    def delete_artifacts(self):
        logger.info("deleting any artifact pods, pvcs, and backups")
        self.remote.delete_all_backups()
        self.remote.delete_all_pods()
        self.remote.delete_all_pvc()
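For orientation, a minimal sketch of how the installer above might be driven. The class name KubernetesInstaller and the constructor arguments are assumptions for illustration; only install() and uninstall() come from the code above.

def run_install_cycle(cluster_spec, options):
    # Hypothetical driver; KubernetesInstaller is an assumed name for
    # the installer class defined above.
    installer = KubernetesInstaller(cluster_spec, options)
    installer.install()        # operator + admission controller, then Celery broker
    try:
        pass                   # ... run the workload here ...
    finally:
        installer.uninstall()  # operator, broker, workers, leftover pods/PVCs/backups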
Example #54
0
class CbAgent(object):
    def __init__(self, test):
        self.clusters = OrderedDict()
        self.remote = RemoteHelper(test.cluster_spec, test.test_config, verbose=True)

        for cluster_name, servers in test.cluster_spec.yield_clusters():
            cluster = "{}_{}_{}".format(cluster_name, test.build.replace(".", ""), uhex()[:3])
            master = servers[0].split(":")[0]
            self.clusters[cluster] = master
        if test.test_config.test_case.monitor_clients:
            for node in test.cluster_spec.workers:
                cluster = "{}{}".format(self.clusters.items()[0][0][:-3], uhex()[:3])
                master = node.split(":")[0]
                self.clusters[cluster] = master

        self.index_node = ""
        for _, servers in test.cluster_spec.yield_servers_by_role("index"):
            if servers:
                self.index_node = servers[0].split(":")[0]

        if hasattr(test, "ALL_BUCKETS"):
            buckets = None
        else:
            buckets = test.test_config.buckets[:1]
        if hasattr(test, "ALL_HOSTNAMES"):
            hostnames = tuple(test.cluster_spec.yield_hostnames())
        else:
            hostnames = None

        self.settings = type(
            "settings",
            (object,),
            {
                "seriesly_host": test.test_config.stats_settings.seriesly["host"],
                "cbmonitor_host_port": test.test_config.stats_settings.cbmonitor["host"],
                "interval": test.test_config.stats_settings.interval,
                "secondary_statsfile": test.test_config.stats_settings.secondary_statsfile,
                "buckets": buckets,
                "hostnames": hostnames,
                "sync_gateway_nodes": test.remote.gateways if test.remote else None,
                "monitor_clients": test.cluster_spec.workers if test.test_config.test_case.monitor_clients else None,
            },
        )()
        self.lat_interval = test.test_config.stats_settings.lat_interval
        if test.cluster_spec.ssh_credentials:
            self.settings.ssh_username, self.settings.ssh_password = test.cluster_spec.ssh_credentials
        self.settings.rest_username, self.settings.rest_password = test.cluster_spec.rest_credentials
        self.settings.bucket_password = test.test_config.bucket.password

        self.settings.index_node = self.index_node

        self.collectors = []
        self.processes = []
        self.snapshots = []
        self.bandwidth = False

    def prepare_collectors(
        self,
        test,
        bandwidth=False,
        subdoc_latency=False,
        latency=False,
        secondary_stats=False,
        query_latency=False,
        spatial_latency=False,
        n1ql_latency=False,
        n1ql_stats=False,
        index_latency=False,
        persist_latency=False,
        replicate_latency=False,
        xdcr_lag=False,
        secondary_latency=False,
        secondary_debugstats=False,
    ):
        clusters = self.clusters.keys()
        self.bandwidth = bandwidth
        self.prepare_ns_server(clusters)
        self.prepare_active_tasks(clusters)
        if test.remote is None or test.remote.os != "Cygwin":
            self.prepare_ps(clusters)
            self.prepare_net(clusters)
            self.prepare_iostat(clusters, test)
        elif test.remote.os == "Cygwin":
            self.prepare_tp(clusters)
        if subdoc_latency:
            self.prepare_subdoc_latency(clusters, test)
        if latency:
            self.prepare_latency(clusters, test)
        if query_latency:
            self.prepare_query_latency(clusters, test)
        if spatial_latency:
            self.prepare_spatial_latency(clusters, test)
        if n1ql_latency:
            self.prepare_n1ql_latency(clusters, test)
        if secondary_stats:
            self.prepare_secondary_stats(clusters)
        if secondary_debugstats:
            self.prepare_secondary_debugstats(clusters)
        if secondary_latency:
            self.prepare_secondary_latency(clusters)
        if n1ql_stats:
            self.prepare_n1ql_stats(clusters)
        if index_latency:
            self.prepare_index_latency(clusters)
        if persist_latency:
            self.prepare_persist_latency(clusters)
        if replicate_latency:
            self.prepare_replicate_latency(clusters)
        if xdcr_lag:
            self.prepare_xdcr_lag(clusters)

    def prepare_ns_server(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            collector = NSServer(settings)
            try:
                # Probe the cluster: iterating the buckets raises
                # RuntimeError if the REST endpoint is unreachable.
                sum(1 for _ in collector.get_buckets())
                self.collectors.append(collector)
            except RuntimeError:
                pass

    def prepare_secondary_stats(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(SecondaryStats(settings))

    def prepare_secondary_debugstats(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(SecondaryDebugStats(settings))

    def prepare_secondary_latency(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(SecondaryLatencyStats(settings))

    def prepare_n1ql_stats(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(N1QLStats(settings))

    def prepare_ps(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            ps_collector = PS(settings)
            self.collectors.append(ps_collector)

    def prepare_tp(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            tp_collector = TypePerf(settings)
            self.collectors.append(tp_collector)

    def prepare_net(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            net_collector = Net(settings)
            self.collectors.append(net_collector)

    def prepare_iostat(self, clusters, test):
        # If tests are run locally, no paths are defined in the cluster
        # spec, so fall back to the paths configured on the server
        # itself, fetched via the REST API.
        rest = None

        if test.cluster_spec.paths:
            data_path, index_path = test.cluster_spec.paths
        else:
            rest = RestHelper(test.cluster_spec)

        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]

            if rest is not None:
                data_path, index_path = rest.get_data_path(settings.master_node)

            partitions = {"data": data_path}
            if hasattr(test, "ddocs"):  # all instances of IndexTest have it
                partitions["index"] = index_path

            settings.partitions = partitions
            io_collector = IO(settings)
            self.collectors.append(io_collector)

    def prepare_persist_latency(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            settings.observe = "persist"
            self.collectors.append(ObserveLatency(settings))

    def prepare_replicate_latency(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            settings.observe = "replicate"
            self.collectors.append(ObserveLatency(settings))

    def prepare_index_latency(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            settings.observe = "index"
            self.collectors.append(ObserveLatency(settings))

    def prepare_xdcr_lag(self, clusters):
        reversed_clusters = list(self.clusters)[::-1]
        for i, cluster in enumerate(clusters):
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            dest_cluster = reversed_clusters[i]
            settings.dest_master_node = self.clusters[dest_cluster]
            self.collectors.append(XdcrLag(settings))

    def prepare_latency(self, clusters, test):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            prefix = test.target_iterator.prefix or target_hash(settings.master_node.split(":")[0])
            self.collectors.append(SpringLatency(settings, test.workload, prefix))

    def prepare_subdoc_latency(self, clusters, test):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            prefix = test.target_iterator.prefix or target_hash(settings.master_node.split(":")[0])
            self.collectors.append(SpringSubdocLatency(settings, test.workload, prefix))

    def prepare_query_latency(self, clusters, test):
        params = test.test_config.index_settings.params
        index_type = test.test_config.index_settings.index_type
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            prefix = test.target_iterator.prefix or target_hash(settings.master_node.split(":")[0])
            self.collectors.append(
                SpringQueryLatency(
                    settings, test.workload, prefix=prefix, ddocs=test.ddocs, params=params, index_type=index_type
                )
            )

    def prepare_spatial_latency(self, clusters, test):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            prefix = test.target_iterator.prefix or target_hash(settings.master_node.split(":")[0])
            self.collectors.append(
                SpringSpatialQueryLatency(
                    settings, test.workload, prefix=prefix, spatial_settings=test.test_config.spatial_settings
                )
            )

    def prepare_n1ql_latency(self, clusters, test):
        default_queries = test.test_config.access_settings.n1ql_queries
        self.settings.new_n1ql_queries = getattr(test, "n1ql_queries", default_queries)
        for cluster in clusters:
            settings = copy(self.settings)
            settings.interval = self.lat_interval
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            self.collectors.append(SpringN1QLQueryLatency(settings, test.workload, prefix="n1ql"))

    def prepare_active_tasks(self, clusters):
        for cluster in clusters:
            settings = copy(self.settings)
            settings.cluster = cluster
            settings.master_node = self.clusters[cluster]
            collector = ActiveTasks(settings)
            try:
                # Probe the cluster: iterating the buckets raises
                # RuntimeError if the REST endpoint is unreachable.
                sum(1 for _ in collector.get_buckets())
                self.collectors.append(collector)
            except RuntimeError:
                pass

    def update_metadata(self):
        for collector in self.collectors:
            collector.update_metadata()

    def start(self):
        if self.bandwidth:
            self.remote.start_bandwidth_monitor()
        self.processes = [Process(target=c.collect) for c in self.collectors]
        # map() is lazy in Python 3, so start the processes explicitly.
        for process in self.processes:
            process.start()

    def stop(self):
        for process in self.processes:
            process.terminate()
        if self.bandwidth:
            self.remote.stop_bandwidth_monitor()
        return datetime.utcnow()

    def trigger_reports(self, snapshot):
        for report_type in ("html", "get_corr_matrix"):
            url = "http://{}/reports/{}/?snapshot={}".format(self.settings.cbmonitor_host_port, report_type, snapshot)
            logger.info(url)
            requests.get(url=url)

    def add_snapshot(self, phase, ts_from, ts_to):
        for cluster in self.clusters:
            snapshot = "{}_{}".format(cluster, phase)
            self.settings.cluster = cluster
            md_client = MetadataClient(self.settings)
            md_client.add_snapshot(snapshot, ts_from, ts_to)
            self.snapshots.append(snapshot)
            self.trigger_reports(snapshot)
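A sketch of the collection lifecycle implied by CbAgent above: prepare the collectors, start them around the access phase, then record a snapshot. The test object and the phase name are placeholders; test is assumed to carry whatever attributes the constructor and prepare_* methods above expect.

def run_monitored_phase(test):
    from datetime import datetime  # stop() above also returns datetime.utcnow()

    cbagent = CbAgent(test)
    cbagent.prepare_collectors(test, latency=True, n1ql_stats=True)
    cbagent.update_metadata()

    ts_from = datetime.utcnow()
    cbagent.start()
    # ... drive the access phase here ...
    ts_to = cbagent.stop()

    cbagent.add_snapshot('access', ts_from, ts_to)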
Example #55
0
class KubernetesRestHelper(RestBase):
    def __init__(self, cluster_spec: ClusterSpec):
        super().__init__(cluster_spec=cluster_spec)
        self.remote = RemoteHelper(cluster_spec)
        self.ip_table, self.port_translation = self.remote.get_ip_port_mapping(
        )

    def translate_host_and_port(self, host, port):
        trans_host = self.ip_table.get(host)
        trans_port = self.port_translation.get(trans_host).get(str(port))
        return trans_host, trans_port

    def exec_n1ql_statement(self, host: str, statement: str) -> dict:
        host, port = self.translate_host_and_port(host, '8093')
        api = 'http://{}:{}/query/service'\
            .format(host, port)
        data = {
            'statement': statement,
        }
        response = self.post(url=api, data=data)
        return response.json()

    # indexer endpoints not yet exposed by operator
    def get_index_status(self, host: str) -> dict:
        return {'status': [{'status': 'Ready'}]}

    # indexer endpoints not yet exposed by operator
    def get_gsi_stats(self, host: str) -> dict:
        return {'num_docs_queued': 0, 'num_docs_pending': 0}

    def get_active_nodes_by_role(self, master_node: str,
                                 role: str) -> List[str]:
        active_nodes_by_role = []
        for node in self.cluster_spec.servers_by_role(role):
            active_nodes_by_role.append(node)
        return active_nodes_by_role

    def node_statuses(self, host: str) -> dict:
        host, port = self.translate_host_and_port(host, '8091')
        api = 'http://{}:{}/nodeStatuses'\
            .format(host, port)
        data = self.get(url=api).json()
        return {node: info['status'] for node, info in data.items()}

    def get_version(self, host: str) -> str:
        logger.info('Getting Couchbase Server version')
        host, port = self.translate_host_and_port(host, '8091')
        api = 'http://{}:{}/pools/'\
            .format(host, port)
        r = self.get(url=api).json()
        return r['implementationVersion'] \
            .replace('-rel-enterprise', '') \
            .replace('-enterprise', '') \
            .replace('-community', '')

    def get_bucket_stats(self, host: str, bucket: str) -> dict:
        host, port = self.translate_host_and_port(host, '8091')
        api = 'http://{}:{}/pools/default/buckets/{}/stats'\
            .format(host, port, bucket)
        return self.get(url=api).json()

    def get_bucket_info(self, host: str, bucket: str) -> List[str]:
        host, port = self.translate_host_and_port(host, '8091')
        api = 'http://{}:{}/pools/default/buckets/{}'\
            .format(host, port, bucket)
        return self.get(url=api).json()
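A minimal usage sketch for the helper above. The pod hostname below is a made-up placeholder; translate_host_and_port() resolves the translated IP and port internally before the request goes out.

def smoke_test_n1ql(cluster_spec):
    # Hypothetical smoke test; 'cb-example-0000.cb-example' is an
    # illustrative pod hostname, not a real one.
    rest = KubernetesRestHelper(cluster_spec)
    response = rest.exec_n1ql_statement('cb-example-0000.cb-example',
                                        'SELECT RAW 1;')
    return response['results']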
Example #56
0
class DefaultMonitor(DefaultRestHelper):

    MAX_RETRY = 150
    MAX_RETRY_RECOVERY = 1200
    MAX_RETRY_TIMER_EVENT = 18000
    MAX_RETRY_BOOTSTRAP = 1200

    MONITORING_DELAY = 5

    POLLING_INTERVAL = 2
    POLLING_INTERVAL_INDEXING = 1
    POLLING_INTERVAL_MACHINE_UP = 10
    POLLING_INTERVAL_ANALYTICS = 15
    POLLING_INTERVAL_EVENTING = 1

    REBALANCE_TIMEOUT = 3600 * 6
    TIMEOUT = 3600 * 12

    DISK_QUEUES = (
        'ep_queue_size',
        'ep_flusher_todo',
        'ep_diskqueue_items',
        'vb_active_queue_size',
        'vb_replica_queue_size',
    )

    DCP_QUEUES = (
        'ep_dcp_replica_items_remaining',
        'ep_dcp_other_items_remaining',
    )

    XDCR_QUEUES = ('replication_changes_left', )

    def __init__(self, cluster_spec, test_config, verbose):
        super().__init__(cluster_spec=cluster_spec)
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.master_node = next(cluster_spec.masters)
        self.build = self.get_version(self.master_node)
        version, build_number = self.build.split('-')
        self.build_version_number = tuple(map(
            int, version.split('.'))) + (int(build_number), )

    def monitor_rebalance(self, host):
        logger.info('Monitoring rebalance status')

        is_running = True
        last_progress = 0
        last_progress_time = time.time()
        while is_running:
            time.sleep(self.POLLING_INTERVAL)

            is_running, progress = self.get_task_status(host,
                                                        task_type='rebalance')
            if progress == last_progress:
                if time.time() - last_progress_time > self.REBALANCE_TIMEOUT:
                    logger.error('Rebalance hung')
                    break
            else:
                last_progress = progress
                last_progress_time = time.time()

            if progress is not None:
                logger.info('Rebalance progress: {} %'.format(progress))

        logger.info('Rebalance completed')

    def _wait_for_empty_queues(self, host, bucket, queues, stats_function):
        metrics = list(queues)

        start_time = time.time()
        while metrics:
            bucket_stats = stats_function(host, bucket)
            # As we are changing metrics in the loop; take a copy of it to
            # iterate over.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[-1]
                    if last_value:
                        logger.info('{} = {:,}'.format(metric, last_value))
                        continue
                    else:
                        logger.info('{} reached 0'.format(metric))
                    metrics.remove(metric)
                else:
                    logger.info('{} reached 0'.format(metric))
                    metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Monitoring got stuck')

    def _wait_for_empty_dcp_queues(self, host, bucket, stats_function):
        start_time = time.time()
        while True:
            kv_dcp_stats = stats_function(host, bucket)
            stats = int(kv_dcp_stats["data"][0]["values"][-1][1])
            if stats:
                logger.info('{} = {}'.format('ep_dcp_replica_items_remaining',
                                             stats))
                if time.time() - start_time > self.TIMEOUT:
                    raise Exception('Monitoring got stuck')
                time.sleep(self.POLLING_INTERVAL)
            else:
                logger.info(
                    '{} reached 0'.format('ep_dcp_replica_items_remaining'))
                break

    def _wait_for_replica_count_match(self, host, bucket):
        start_time = time.time()
        bucket_info = self.get_bucket_info(host, bucket)
        replica_number = int(bucket_info['replicaNumber'])
        while replica_number:
            bucket_stats = self.get_bucket_stats(host, bucket)
            curr_items = bucket_stats['op']['samples'].get("curr_items")[-1]
            replica_curr_items = bucket_stats['op']['samples'].get(
                "vb_replica_curr_items")[-1]
            logger.info("curr_items: {}, replica_curr_items: {}".format(
                curr_items, replica_curr_items))
            if (curr_items * replica_number) == replica_curr_items:
                break
            time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Replica items monitoring got stuck')

    def _wait_for_replication_completion(self, host, bucket, queues,
                                         stats_function, link1, link2):
        metrics = list(queues)

        completion_count = 0
        link1_time = 0
        link2_items = 0

        link1_completeness_str = \
            'replications/{}/bucket-1/bucket-1/percent_completeness'.format(link1)
        link2_completeness_str = \
            'replications/{}/bucket-1/bucket-1/percent_completeness'.format(link2)
        link2_items_str = \
            'replications/{}/bucket-1/bucket-1/docs_written'.format(link2)

        start_time = time.time()

        while metrics:
            bucket_stats = stats_function(host, bucket)
            # As we are changing metrics in the loop; take a copy of it to
            # iterate over.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[-1]
                    if last_value:
                        logger.info('{} = {:,}'.format(metric, last_value))
                        link1_completeness = \
                            bucket_stats['op']['samples'].get(link1_completeness_str)[-1]
                        link2_completeness = \
                            bucket_stats['op']['samples'].get(link2_completeness_str)[-1]
                        # Both links record the same values on completion,
                        # so one combined check is sufficient.
                        if (link1_completeness == 100 or
                                link2_completeness == 100) and \
                                completion_count == 0:
                            link1_time = time.time()
                            link2_items = \
                                bucket_stats['op']['samples'].get(link2_items_str)[-1]
                            completion_count += 1
                        continue
                    else:
                        logger.info('{} reached 0'.format(metric))
                        if completion_count == 0:
                            link1_time = time.time()
                            link2_items = \
                                bucket_stats['op']['samples'].get(link2_items_str)[-1]
                            completion_count += 1
                    metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Monitoring got stuck')

        return link1_time, link2_items

    def _wait_for_completeness(self, host, bucket, xdcr_link, stats_function):
        metrics = [xdcr_link]

        start_time = time.time()

        while metrics:
            bucket_stats = stats_function(host, bucket)

            # Take a copy of metrics, as it is mutated inside the loop.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[0]
                    if last_value != 100:
                        logger.info('{} : {}'.format(metric, last_value))
                    else:
                        logger.info('{} Completed 100 %'.format(metric))
                        metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Monitoring got stuck')

    def monitor_disk_queues(self, host, bucket):
        logger.info('Monitoring disk queues: {}'.format(bucket))
        self._wait_for_empty_queues(host, bucket, self.DISK_QUEUES,
                                    self.get_bucket_stats)

    def monitor_dcp_queues(self, host, bucket):
        logger.info('Monitoring DCP queues: {}'.format(bucket))

        if self.build_version_number < (7, 0, 0, 3937):
            self._wait_for_empty_queues(host, bucket, self.DCP_QUEUES,
                                        self.get_bucket_stats)
        else:
            if self.test_config.bucket.replica_number != 0:
                self._wait_for_empty_dcp_queues(host, bucket,
                                                self.get_dcp_replication_items)
            self.DCP_QUEUES = ('ep_dcp_other_items_remaining', )
            self._wait_for_empty_queues(host, bucket, self.DCP_QUEUES,
                                        self.get_bucket_stats)

    def monitor_replica_count(self, host, bucket):
        logger.info('Monitoring replica count match: {}'.format(bucket))
        self._wait_for_replica_count_match(host, bucket)

    def _wait_for_xdcr_to_start(self, host: str):
        is_running = False
        while not is_running:
            time.sleep(self.POLLING_INTERVAL)
            is_running, _ = self.get_task_status(host, task_type='xdcr')

    def xdcr_link_starttime(self, host: str, uuid: str):
        is_running = False
        while not is_running:
            time.sleep(self.POLLING_INTERVAL)
            is_running, _ = self.get_xdcrlink_status(host,
                                                     task_type='xdcr',
                                                     uuid=uuid)
        return time.time()

    def monitor_xdcr_queues(self, host: str, bucket: str):
        logger.info('Monitoring XDCR queues: {}'.format(bucket))
        self._wait_for_xdcr_to_start(host)
        # Temporary delay to make sure the replication_changes_left stat arrives
        time.sleep(20)
        self._wait_for_empty_queues(host, bucket, self.XDCR_QUEUES,
                                    self.get_xdcr_stats)

    def monitor_xdcr_changes_left(self, host: str, bucket: str, xdcrlink1: str,
                                  xdcrlink2: str):
        logger.info('Monitoring XDCR queues: {}'.format(bucket))
        self._wait_for_xdcr_to_start(host)
        start_time = time.time()
        link1_time, link2_items = self._wait_for_replication_completion(
            host, bucket, self.XDCR_QUEUES, self.get_xdcr_stats, xdcrlink1,
            xdcrlink2)
        return start_time, link1_time, link2_items

    def monitor_xdcr_completeness(self, host: str, bucket: str,
                                  xdcr_link: str):
        logger.info('Monitoring XDCR Link Completeness: {}'.format(bucket))
        self._wait_for_completeness(host=host,
                                    bucket=bucket,
                                    xdcr_link=xdcr_link,
                                    stats_function=self.get_xdcr_stats)
        return time.time()

    def get_num_items(self, host: str, bucket: str):
        num_items = self._get_num_items(host, bucket, total=True)
        return num_items

    def _get_num_items(self,
                       host: str,
                       bucket: str,
                       total: bool = False) -> int:
        stats = self.get_bucket_stats(host=host, bucket=bucket)
        if total:
            curr_items = stats['op']['samples'].get('curr_items_tot')
        else:
            curr_items = stats['op']['samples'].get('curr_items')
        if curr_items:
            return curr_items[-1]
        return 0

    def monitor_num_items(self, host: str, bucket: str, num_items: int):
        logger.info('Checking the number of items in {}'.format(bucket))
        retries = 0
        while retries < self.MAX_RETRY:
            curr_items = self._get_num_items(host, bucket, total=True)
            if curr_items == num_items:
                break
            else:
                logger.info('{}(curr_items) != {}(num_items)'.format(
                    curr_items, num_items))
            time.sleep(self.POLLING_INTERVAL)
            retries += 1
        else:
            actual_items = self._get_num_items(host, bucket, total=True)
            raise Exception(
                'Mismatch in the number of items: {}'.format(actual_items))

    def monitor_num_backfill_items(self, host: str, bucket: str,
                                   num_items: int):
        logger.info('Checking the number of items in {}'.format(bucket))
        t0 = time.time()
        while True:
            curr_items = self._get_num_items(host, bucket, total=True)
            if curr_items == num_items:
                t1 = time.time()
                break
            else:
                logger.info('{}(curr_items) != {}(num_items)'.format(
                    curr_items, num_items))
            time.sleep(self.POLLING_INTERVAL)
        return t1 - t0

    def monitor_task(self, host, task_type):
        logger.info('Monitoring task: {}'.format(task_type))
        time.sleep(self.MONITORING_DELAY * 2)

        while True:
            time.sleep(self.POLLING_INTERVAL)

            tasks = [
                task for task in self.get_tasks(host)
                if task.get('type') == task_type
            ]
            if tasks:
                for task in tasks:
                    logger.info('{}: {}%, bucket: {}, ddoc: {}'.format(
                        task_type, task.get('progress'), task.get('bucket'),
                        task.get('designDocument')))
            else:
                break
        logger.info('Task {} successfully completed'.format(task_type))

    def monitor_warmup(self, memcached, host, bucket):
        logger.info('Monitoring warmup status: {}@{}'.format(bucket, host))

        memcached_port = self.get_memcached_port(host)

        while True:
            stats = memcached.get_stats(host, memcached_port, bucket, 'warmup')
            if 'ep_warmup_state' in stats:
                state = stats['ep_warmup_state']
                if state == 'done':
                    return float(stats.get('ep_warmup_time', 0))
                else:
                    logger.info('Warmup status: {}'.format(state))
                    time.sleep(self.POLLING_INTERVAL)
            else:
                logger.info('No warmup stats are available, continue polling')
                time.sleep(self.POLLING_INTERVAL)

    def monitor_compression(self, memcached, host, bucket):
        logger.info('Monitoring active compression status')

        memcached_port = self.get_memcached_port(host)

        json_docs = -1
        while json_docs:
            stats = memcached.get_stats(host, memcached_port, bucket)
            json_docs = int(stats['ep_active_datatype_json'])
            if json_docs:
                logger.info('Still uncompressed: {:,} items'.format(json_docs))
                time.sleep(self.POLLING_INTERVAL)
        logger.info('All items are compressed')

    def monitor_node_health(self, host):
        logger.info('Monitoring node health')

        for retry in range(self.MAX_RETRY):
            unhealthy_nodes = {
                n
                for n, status in self.node_statuses(host).items()
                if status != 'healthy'
            } | {
                n
                for n, status in self.node_statuses_v2(host).items()
                if status != 'healthy'
            }
            if unhealthy_nodes:
                time.sleep(self.POLLING_INTERVAL)
            else:
                break
        else:
            logger.interrupt(
                'Some nodes are not healthy: {}'.format(unhealthy_nodes))

    def monitor_analytics_node_active(self, host):
        logger.info('Monitoring analytics node health')

        for retry in range(self.MAX_RETRY):
            active = self.analytics_node_active(host)
            if active:
                break
            else:
                time.sleep(self.POLLING_INTERVAL)
        else:
            logger.interrupt(
                'Analytics node still not healthy: {}'.format(host))

    def is_index_ready(self, host: str) -> bool:
        for status in self.get_index_status(host)['status']:
            if status['status'] != 'Ready':
                return False
        return True

    def estimate_pending_docs(self, host: str) -> int:
        stats = self.get_gsi_stats(host)
        pending_docs = 0
        for metric, value in stats.items():
            if 'num_docs_queued' in metric or 'num_docs_pending' in metric:
                pending_docs += value
        return pending_docs

    def monitor_indexing(self, host):
        logger.info('Monitoring indexing progress')

        while not self.is_index_ready(host):
            time.sleep(self.POLLING_INTERVAL_INDEXING * 5)
            pending_docs = self.estimate_pending_docs(host)
            logger.info('Pending docs: {:,}'.format(pending_docs))

        logger.info('Indexing completed')

    def wait_for_secindex_init_build(self, host, indexes):
        # POLL until initial index build is complete
        logger.info("Waiting for the following indexes to be ready: {}".format(
            indexes))

        indexes_ready = [0 for _ in indexes]

        def get_index_status(json2i, index):
            """Return the index status."""
            for d in json2i["status"]:
                if d["name"] == index:
                    return d["status"]
            return None

        @misc.retry(catch=(KeyError, ), iterations=10, wait=30)
        def update_indexes_ready():
            json2i = self.get_index_status(host)
            for i, index in enumerate(indexes):
                status = get_index_status(json2i, index)
                if status == 'Ready':
                    indexes_ready[i] = 1

        init_ts = time.time()
        while sum(indexes_ready) != len(indexes):
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            update_indexes_ready()
        finish_ts = time.time()
        logger.info('secondary index build time: {}'.format(finish_ts -
                                                            init_ts))
        time_elapsed = round(finish_ts - init_ts)
        return time_elapsed

    def wait_for_secindex_init_build_collections(self, host, indexes):
        # POLL until initial index build is complete
        index_list = []
        for bucket_name, scope_map in indexes.items():
            for scope_name, collection_map in scope_map.items():
                for collection_name, index_map in collection_map.items():
                    for index_name in index_map.keys():
                        index_list.append(index_name)
        indexes = index_list
        logger.info("Waiting for the following indexes to be ready: {}".format(
            indexes))

        indexes_ready = [0 for _ in indexes]

        def get_index_status(json2i, index):
            """Return the index status."""
            for d in json2i["status"]:
                if d["name"] == index:
                    return d["status"]
            return None

        @misc.retry(catch=(KeyError, ), iterations=10, wait=30)
        def update_indexes_ready():
            json2i = self.get_index_status(host)
            for i, index in enumerate(indexes):
                status = get_index_status(json2i, index)
                if status == 'Ready':
                    indexes_ready[i] = 1

        while sum(indexes_ready) != len(indexes):
            time.sleep(self.POLLING_INTERVAL_INDEXING * 10)
            update_indexes_ready()
        logger.info('secondary index build complete: {}'.format(indexes))

    def wait_for_secindex_incr_build(self, index_nodes, bucket, indexes,
                                     numitems):
        # POLL until incremental index build is complete
        logger.info('expecting {} num_docs_indexed for indexes {}'.format(
            numitems, indexes))

        # collect num_docs_indexed information globally from all index nodes
        def get_num_docs_indexed():
            data = self.get_index_stats(index_nodes)
            num_indexed = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_indexed"
                val = data[key]
                num_indexed.append(val)
            return num_indexed

        def get_num_docs_index_pending():
            data = self.get_index_stats(index_nodes)
            num_pending = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_pending"
                val1 = data[key]
                key = "" + bucket + ":" + index + ":num_docs_queued"
                val2 = data[key]
                val = int(val1) + int(val2)
                num_pending.append(val)
            return num_pending

        expected_num_pending = [0] * len(indexes)
        while True:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_num_pending = get_num_docs_index_pending()
            if curr_num_pending == expected_num_pending:
                break
        curr_num_indexed = get_num_docs_indexed()
        logger.info("Number of Items indexed {}".format(curr_num_indexed))

    def wait_for_secindex_incr_build_collections(self, index_nodes, index_map,
                                                 expected_num_docs):
        indexes = []
        for bucket_name, scope_map in index_map.items():
            for scope_name, collection_map in scope_map.items():
                for collection_name, coll_index_map in collection_map.items():
                    for index_name, index_def in coll_index_map.items():
                        if scope_name == '_default' \
                                and collection_name == '_default':
                            target_index = "{}:{}".format(
                                bucket_name, index_name)
                        else:
                            target_index = "{}:{}:{}:{}".format(
                                bucket_name, scope_name, collection_name,
                                index_name)
                        indexes.append(target_index)
        logger.info('expecting {} num_docs_indexed for indexes {}'.format(
            expected_num_docs, indexes))

        # collect num_docs_indexed information globally from all index nodes
        def get_num_docs_indexed():
            data = self.get_index_stats(index_nodes)
            num_indexed = []
            for index in indexes:
                key = index + ":num_docs_indexed"
                val = data[key]
                num_indexed.append(val)
            return num_indexed

        def get_num_docs_index_pending():
            data = self.get_index_stats(index_nodes)
            num_pending = []
            for index in indexes:
                key = index + ":num_docs_pending"
                val1 = data[key]
                key = index + ":num_docs_queued"
                val2 = data[key]
                val = int(val1) + int(val2)
                num_pending.append(val)
            return num_pending

        expected_num_pending = [0] * len(indexes)
        while True:
            time.sleep(self.POLLING_INTERVAL_INDEXING * 10)
            curr_num_pending = get_num_docs_index_pending()
            if curr_num_pending == expected_num_pending:
                break

        curr_num_indexed = get_num_docs_indexed()
        logger.info("Number of Items indexed {}".format(curr_num_indexed))

    def wait_for_num_connections(self, index_node, expected_connections):
        curr_connections = self.get_index_num_connections(index_node)
        retry = 1
        while curr_connections < expected_connections and retry < self.MAX_RETRY:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_connections = self.get_index_num_connections(index_node)
            logger.info("Got current connections {}".format(curr_connections))
            retry += 1
        if retry == self.MAX_RETRY:
            return False
        return True

    def wait_for_recovery(self, index_nodes, bucket, index):
        time.sleep(self.MONITORING_DELAY)
        for retry in range(self.MAX_RETRY_RECOVERY):
            response = self.get_index_stats(index_nodes)
            item = "{}:{}:disk_load_duration".format(bucket, index)
            if item in response:
                return response[item]
            else:
                time.sleep(self.POLLING_INTERVAL)
        return -1

    def wait_for_servers(self):
        for retry in range(self.MAX_RETRY):
            logger.info('Waiting for all servers to be available')
            time.sleep(self.POLLING_INTERVAL_MACHINE_UP)

            for server in self.cluster_spec.servers:
                if not self.remote.is_up(server):
                    break
            else:
                logger.info('All nodes are up')
                return

        logger.interrupt('Some nodes are still down')

    def monitor_fts_indexing_queue(self, host: str, index: str, items: int):
        logger.info('Waiting for indexing to finish')
        count = 0
        while count < items:
            count = self.get_fts_doc_count(host, index)
            logger.info('FTS indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_fts_index_persistence(self,
                                      hosts: list,
                                      index: str,
                                      bkt: str = None):
        logger.info('Waiting for index to be persisted')
        if not bkt:
            bkt = self.test_config.buckets[0]
        tries = 0
        pending_items = 1
        while pending_items:
            try:
                persist = 0
                compact = 0
                for host in hosts:
                    stats = self.get_fts_stats(host)
                    metric = '{}:{}:{}'.format(bkt, index,
                                               'num_recs_to_persist')
                    persist += stats[metric]

                    metric = '{}:{}:{}'.format(bkt, index, 'total_compactions')
                    compact += stats[metric]

                pending_items = persist or compact
                logger.info('Records to persist: {:,}'.format(persist))
                logger.info('Ongoing compactions: {:,}'.format(compact))
            except KeyError:
                tries += 1
            if tries >= 10:
                raise Exception("cannot get fts stats")
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_indexing_queue(self, host: str, index: str):
        logger.info('Waiting for indexing to finish')
        items = int(self.test_config.fts_settings.test_total_docs)
        count = 0
        while count < items:
            count = self.get_elastic_doc_count(host, index)
            logger.info('Elasticsearch indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_index_persistence(self, host: str, index: str):
        logger.info('Waiting for index to be persisted')

        pending_items = -1
        while pending_items:
            stats = self.get_elastic_stats(host)
            pending_items = stats['indices'][index]['total']['translog'][
                'operations']
            logger.info('Records to persist: {:,}'.format(pending_items))
            time.sleep(self.POLLING_INTERVAL)

    def wait_for_bootstrap(self, nodes: list, function: str):
        logger.info(
            'Waiting for bootstrap of eventing function: {}'.format(function))
        for node in nodes:
            retry = 1
            while retry < self.MAX_RETRY_BOOTSTRAP:
                if function in self.get_apps_with_status(node, "deployed"):
                    break
                time.sleep(self.POLLING_INTERVAL)
                retry += 1
            if retry == self.MAX_RETRY_BOOTSTRAP:
                logger.info(
                    'Failed to bootstrap function: {}, node: {}'.format(
                        function, node))

    def get_num_analytics_items(self, analytics_node: str, bucket: str) -> int:
        stats_key = '{}:all:incoming_records_count_total'.format(bucket)
        num_items = 0
        for node in self.get_active_nodes_by_role(analytics_node, 'cbas'):
            stats = self.get_analytics_stats(node)
            num_items += stats.get(stats_key, 0)
        return num_items

    def get_num_remaining_mutations(self, analytics_node: str) -> int:
        while True:
            num_items = 0
            try:
                if self.build_version_number < (7, 0, 0, 4622):
                    stats = self.get_pending_mutations(analytics_node)
                    for dataset in stats['Default']:
                        if self.build_version_number < (7, 0, 0, 4310):
                            num_items += int(stats['Default'][dataset])
                        else:
                            num_items += int(
                                stats['Default'][dataset]['seqnoLag'])
                else:
                    stats = self.get_pending_mutations_v2(analytics_node)
                    for scope in stats['scopes']:
                        for collection in scope['collections']:
                            num_items += int(collection['seqnoLag'])
                break
            except Exception:
                time.sleep(self.POLLING_INTERVAL_ANALYTICS)
        return num_items

    def monitor_data_synced(self, data_node: str, bucket: str,
                            analytics_node: str) -> int:
        logger.info('Waiting for data to be synced from {}'.format(data_node))
        time.sleep(self.MONITORING_DELAY * 3)

        num_items = self._get_num_items(data_node, bucket)

        while True:
            if self.build_version_number < (7, 0, 0, 0):
                num_analytics_items = self.get_num_analytics_items(
                    analytics_node, bucket)
            else:
                incoming_records = self.get_cbas_incoming_records_count(
                    analytics_node)
                num_analytics_items = int(
                    incoming_records["data"][0]["values"][-1][1])

            logger.info('Analytics has {:,} docs (target is {:,})'.format(
                num_analytics_items, num_items))

            if self.build_version_number < (6, 5, 0, 0):
                if num_analytics_items == num_items:
                    break
            else:
                num_remaining_mutations = self.get_num_remaining_mutations(
                    analytics_node)
                logger.info('Number of remaining mutations: {}'.format(
                    num_remaining_mutations))

                if num_remaining_mutations == 0:
                    break

            time.sleep(self.POLLING_INTERVAL_ANALYTICS)

        return num_items

    def monitor_dataset_drop(self, analytics_node: str, dataset: str):
        while True:
            statement = "SELECT COUNT(*) from `{}`;".format(dataset)
            result = self.exec_analytics_query(analytics_node, statement)
            num_analytics_items = result['results'][0]['$1']
            logger.info("Number of items in dataset {}: {}".format(
                dataset, num_analytics_items))

            if num_analytics_items == 0:
                break

            time.sleep(self.POLLING_INTERVAL)

    def wait_for_timer_event(self,
                             node: str,
                             function: str,
                             event="timer_events"):
        logger.info('Waiting for timer events to start processing: {}'.format(
            function))
        retry = 1
        while retry < self.MAX_RETRY_TIMER_EVENT:
            if 0 < self.get_num_events_processed(
                    event=event, node=node, name=function):
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info(
                'Failed to get timer event for function: {}'.format(function))

    def wait_for_all_mutations_processed(self, host: str, bucket1: str,
                                         bucket2: str):
        logger.info(
            'Waiting for all mutations of the eventing function to be processed')
        retry = 1
        while retry < self.MAX_RETRY_BOOTSTRAP:
            if self._get_num_items(host=host, bucket=bucket1) == \
                    self._get_num_items(host=host, bucket=bucket2):
                break
            retry += 1
            time.sleep(self.POLLING_INTERVAL_EVENTING)
        if retry == self.MAX_RETRY_BOOTSTRAP:
            logger.info('Failed to process all mutations... TIMEOUT')

    def wait_for_all_timer_creation(self, node: str, function: str):
        logger.info(
            'Waiting for all timers to be created by: {}'.format(function))
        retry = 1
        events_processed = {}
        while retry < self.MAX_RETRY_TIMER_EVENT:
            events_processed = self.get_num_events_processed(event="ALL",
                                                             node=node,
                                                             name=function)
            if events_processed["dcp_mutation"] == events_processed[
                    "timer_responses_received"]:
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info('Got only {} timers created for function: {}'.format(
                events_processed["timer_responses_received"], function))

    def wait_for_function_status(self, node: str, function: str, status: str):
        logger.info('Waiting for {} function to {}'.format(function, status))
        retry = 1
        while retry < self.MAX_RETRY_TIMER_EVENT:
            op = self.get_apps_with_status(node, status)
            if function in op:
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info('Function {} failed to {}...!!!'.format(
                function, status))
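A sketch of a typical post-load sequence built on the monitor above; it mirrors the wait_for_persistence() pattern in the next example. The monitor, master, and buckets arguments are placeholders.

def wait_for_quiescence(monitor, master, buckets):
    # Drain disk and DCP queues, then confirm replica counts match.
    for bucket in buckets:
        monitor.monitor_disk_queues(master, bucket)
        monitor.monitor_dcp_queues(master, bucket)
        monitor.monitor_replica_count(master, bucket)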
Example #57
0
class PerfTest:

    COLLECTORS = {}

    ROOT_CERTIFICATE = 'root.pem'

    def __init__(self, cluster_spec: ClusterSpec, test_config: TestConfig,
                 verbose: bool):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.target_iterator = TargetIterator(cluster_spec, test_config)

        self.cluster = ClusterManager(cluster_spec, test_config)
        self.memcached = MemcachedHelper(test_config)
        self.monitor = Monitor(cluster_spec, test_config, verbose)
        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, verbose)
        self.profiler = Profiler(cluster_spec, test_config)

        self.master_node = next(cluster_spec.masters)
        self.build = self.rest.get_version(self.master_node)

        self.metrics = MetricHelper(self)
        self.reporter = ShowFastReporter(cluster_spec, test_config, self.build)

        self.cbmonitor_snapshots = []
        self.cbmonitor_clusters = []

        if self.test_config.test_case.use_workers:
            self.worker_manager = WorkerManager(cluster_spec, test_config,
                                                verbose)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        failure = self.debug()

        self.tear_down()

        if exc_type == KeyboardInterrupt:
            logger.warn('The test was interrupted')
            return True

        if failure:
            logger.interrupt(failure)

    @property
    def query_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node, 'n1ql')

    @property
    def index_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node, 'index')

    @property
    def fts_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node, 'fts')

    @property
    def analytics_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node, 'cbas')

    @property
    def eventing_nodes(self) -> List[str]:
        return self.rest.get_active_nodes_by_role(self.master_node, 'eventing')

    def tear_down(self):
        if self.test_config.test_case.use_workers:
            self.worker_manager.download_celery_logs()
            self.worker_manager.terminate()

        if self.test_config.cluster.online_cores:
            self.remote.enable_cpu()

        if self.test_config.cluster.kernel_mem_limit:
            self.collect_logs()

            self.cluster.reset_memory_settings()

    def collect_logs(self):
        self.remote.collect_info()

        for hostname in self.cluster_spec.servers:
            for fname in glob.glob('{}/*.zip'.format(hostname)):
                shutil.move(fname, '{}.zip'.format(hostname))

    def reset_memory_settings(self):
        if self.test_config.cluster.kernel_mem_limit:
            for service in self.test_config.cluster.kernel_mem_limit_services:
                for server in self.cluster_spec.servers_by_role(service):
                    self.remote.reset_memory_settings(host_string=server)
            self.monitor.wait_for_servers()

    def debug(self) -> Optional[str]:
        failure = self.check_core_dumps()
        failure = self.check_rebalance() or failure
        return self.check_failover() or failure

    def download_certificate(self):
        cert = self.rest.get_certificate(self.master_node)
        with open(self.ROOT_CERTIFICATE, 'w') as fh:
            fh.write(cert)

    def check_rebalance(self) -> Optional[str]:
        for master in self.cluster_spec.masters:
            if self.rest.is_not_balanced(master):
                return 'The cluster is not balanced'

    def check_failover(self) -> Optional[str]:
        if hasattr(self, 'rebalance_settings'):
            if self.rebalance_settings.failover or \
                    self.rebalance_settings.graceful_failover:
                return

        for master in self.cluster_spec.masters:
            num_failovers = self.rest.get_failover_counter(master)
            if num_failovers:
                return 'Failover happened {} time(s)'.format(num_failovers)

    def check_core_dumps(self) -> Optional[str]:
        dumps_per_host = self.remote.detect_core_dumps()
        core_dumps = {
            host: dumps
            for host, dumps in dumps_per_host.items() if dumps
        }
        if core_dumps:
            return pretty_dict(core_dumps)

    def restore(self):
        logger.info('Restoring data')
        self.remote.restore_data(
            self.test_config.restore_settings.backup_storage,
            self.test_config.restore_settings.backup_repo,
        )

    def restore_local(self):
        logger.info('Restoring data')
        local.extract_cb(filename='couchbase.rpm')
        local.cbbackupmgr_restore(
            master_node=self.master_node,
            cluster_spec=self.cluster_spec,
            threads=self.test_config.restore_settings.threads,
            archive=self.test_config.restore_settings.backup_storage,
            repo=self.test_config.restore_settings.backup_repo,
        )

    def import_data(self):
        logger.info('Importing data')
        for bucket in self.test_config.buckets:
            self.remote.import_data(
                self.test_config.restore_settings.import_file,
                bucket,
            )

    def compact_bucket(self, wait: bool = True):
        for target in self.target_iterator:
            self.rest.trigger_bucket_compaction(target.node, target.bucket)

        if wait:
            for target in self.target_iterator:
                self.monitor.monitor_task(target.node, 'bucket_compaction')

    def wait_for_persistence(self):
        for target in self.target_iterator:
            self.monitor.monitor_disk_queues(target.node, target.bucket)
            self.monitor.monitor_dcp_queues(target.node, target.bucket)
            self.monitor.monitor_replica_count(target.node, target.bucket)

    def wait_for_indexing(self):
        if self.test_config.index_settings.statements:
            for server in self.index_nodes:
                self.monitor.monitor_indexing(server)

    def check_num_items(self):
        num_items = self.test_config.load_settings.items * (
            1 + self.test_config.bucket.replica_number)
        for target in self.target_iterator:
            self.monitor.monitor_num_items(target.node, target.bucket,
                                           num_items)

    def reset_kv_stats(self):
        master_node = next(self.cluster_spec.masters)
        for bucket in self.test_config.buckets:
            for server in self.rest.get_server_list(master_node, bucket):
                port = self.rest.get_memcached_port(server)
                self.memcached.reset_stats(server, port, bucket)

    def create_indexes(self):
        logger.info('Creating and building indexes')

        if not self.test_config.index_settings.couchbase_fts_index_name:
            for statement in self.test_config.index_settings.statements:
                self.rest.exec_n1ql_statement(self.query_nodes[0], statement)
        else:
            self.create_fts_index_n1ql()

    def create_fts_index_n1ql(self):
        definition = read_json(
            self.test_config.index_settings.couchbase_fts_index_configfile)
        definition.update(
            {'name': self.test_config.index_settings.couchbase_fts_index_name})
        logger.info('Index definition: {}'.format(pretty_dict(definition)))
        self.rest.create_fts_index(
            self.fts_nodes[0],
            self.test_config.index_settings.couchbase_fts_index_name,
            definition)
        self.monitor.monitor_fts_indexing_queue(
            self.fts_nodes[0],
            self.test_config.index_settings.couchbase_fts_index_name,
            int(self.test_config.access_settings.items))

    def create_functions(self):
        logger.info('Creating n1ql functions')

        for statement in self.test_config.n1ql_function_settings.statements:
            self.rest.exec_n1ql_statement(self.query_nodes[0], statement)

    def sleep(self):
        access_settings = self.test_config.access_settings
        logger.info('Running phase for {} seconds'.format(
            access_settings.time))
        time.sleep(access_settings.time)

    def run_phase(self,
                  phase: str,
                  task: Callable,
                  settings: PhaseSettings,
                  target_iterator: Iterable,
                  timer: int = None,
                  wait: bool = True):
        logger.info('Running {}: {}'.format(phase, pretty_dict(settings)))
        self.worker_manager.run_tasks(task, settings, target_iterator, timer)
        if wait:
            self.worker_manager.wait_for_workers()

    def load(self,
             task: Callable = spring_task,
             settings: PhaseSettings = None,
             target_iterator: Iterable = None):
        if settings is None:
            settings = self.test_config.load_settings
        if target_iterator is None:
            target_iterator = self.target_iterator

        self.run_phase('load phase', task, settings, target_iterator)

    def hot_load(self, task: Callable = spring_task):
        settings = self.test_config.hot_load_settings

        self.run_phase('hot load phase', task, settings, self.target_iterator)

    def xattr_load(self,
                   task: Callable = spring_task,
                   target_iterator: Iterable = None):
        if target_iterator is None:
            target_iterator = self.target_iterator
        settings = self.test_config.xattr_load_settings

        self.run_phase('xattr phase', task, settings, target_iterator)

    def access(self,
               task: Callable = spring_task,
               settings: PhaseSettings = None,
               target_iterator: Iterable = None):
        if settings is None:
            settings = self.test_config.access_settings
        if target_iterator is None:
            target_iterator = self.target_iterator

        self.run_phase('access phase',
                       task,
                       settings,
                       target_iterator,
                       timer=settings.time)

    def access_bg(self,
                  task: Callable = spring_task,
                  settings: PhaseSettings = None,
                  target_iterator: Iterable = None):
        if settings is None:
            settings = self.test_config.access_settings
        if target_iterator is None:
            target_iterator = self.target_iterator

        self.run_phase('background access phase',
                       task,
                       settings,
                       target_iterator,
                       timer=settings.time,
                       wait=False)

    def report_kpi(self, *args, **kwargs):
        if self.test_config.stats_settings.enabled:
            self._report_kpi(*args, **kwargs)

    def _report_kpi(self, *args, **kwargs):
        pass

    def _measure_curr_ops(self) -> int:
        ops = 0
        for bucket in self.test_config.buckets:
            for server in self.rest.get_active_nodes_by_role(
                    self.master_node, "kv"):
                port = self.rest.get_memcached_port(server)

                stats = self.memcached.get_stats(server, port, bucket)
                for stat in b'cmd_get', b'cmd_set':
                    ops += int(stats[stat])
        return ops
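
A minimal usage sketch, not part of the example above: driving this harness through its context manager. The driver function, the class name PerfTest, and the pre-parsed cluster_spec/test_config objects are assumptions, since the excerpt does not show them.

# Hypothetical driver; PerfTest stands in for the class defined above.
def run_test(cluster_spec, test_config):
    with PerfTest(cluster_spec, test_config, verbose=False) as test:
        test.load()                  # load phase via the worker manager
        test.wait_for_persistence()  # drain disk, DCP, and replica queues
        test.check_num_items()       # verify items, including replicas
        test.access()                # timed access phase
        test.report_kpi()            # no-op unless stats are enabled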
Example #58
class ClusterManager(object):

    def __init__(self, cluster_spec, test_config, verbose):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec, test_config, verbose)
        self.monitor = Monitor(cluster_spec)
        self.memcached = MemcachedHelper(test_config)

        self.clusters = cluster_spec.yield_clusters
        self.servers = cluster_spec.yield_servers
        self.masters = cluster_spec.yield_masters

        self.initial_nodes = test_config.cluster.initial_nodes
        self.mem_quota = test_config.cluster.mem_quota
        self.index_mem_quota = test_config.cluster.index_mem_quota
        self.fts_index_mem_quota = test_config.cluster.fts_index_mem_quota
        self.group_number = test_config.cluster.group_number or 1
        self.roles = cluster_spec.roles

    def set_data_path(self):
        if self.cluster_spec.paths:
            data_path, index_path = self.cluster_spec.paths
            for server in self.servers():
                self.rest.set_data_path(server, data_path, index_path)

    def set_auth(self):
        for server in self.servers():
            self.rest.set_auth(server)

    def set_mem_quota(self):
        for server in self.servers():
            self.rest.set_mem_quota(server, self.mem_quota)

    def set_index_mem_quota(self):
        for server in self.servers():
            self.rest.set_index_mem_quota(server, self.index_mem_quota)

    def set_fts_index_mem_quota(self):
        for server in self.servers():
            self.rest.set_fts_index_mem_quota(server, self.fts_index_mem_quota)

    def set_query_settings(self):
        settings = self.test_config.n1ql_settings.settings
        for _, servers in self.cluster_spec.yield_servers_by_role('n1ql'):
            for server in servers:
                self.rest.set_query_settings(server, settings)

    def set_index_settings(self):
        if self.test_config.secondaryindex_settings.db != 'memdb':
            settings = self.test_config.secondaryindex_settings.settings
            for _, servers in self.cluster_spec.yield_servers_by_role('index'):
                for server in servers:
                    self.rest.set_index_settings(server, settings)
            self.remote.restart()
            time.sleep(60)
        else:
            logger.info("DB type is memdb. Not setting the indexer settings to take the default settings")

    def set_services(self):
        for (_, servers), initial_nodes in zip(self.clusters(),
                                               self.initial_nodes):
            master = servers[0]
            self.rest.set_services(master, self.roles[master])

    def disable_moxi(self):
        if self.test_config.cluster.disable_moxi is not None:
            self.remote.disable_moxi()

    def create_server_groups(self):
        for master in self.masters():
            for i in range(1, self.group_number):
                name = 'Group {}'.format(i + 1)
                self.rest.create_server_group(master, name=name)

    def add_nodes(self):
        for (_, servers), initial_nodes in zip(self.clusters(),
                                               self.initial_nodes):

            if initial_nodes < 2:  # Single-node cluster
                continue

            # Adding initial nodes
            master = servers[0]
            if self.group_number > 1:
                groups = self.rest.get_server_groups(master)
            else:
                groups = {}
            for i, host_port in enumerate(servers[1:initial_nodes],
                                          start=1):
                uri = groups.get(server_group(servers[:initial_nodes],
                                              self.group_number, i))
                self.rest.add_node(master, host_port, self.roles[host_port],
                                   uri)

            # Rebalance
            master = servers[0]
            known_nodes = servers[:initial_nodes]
            ejected_nodes = []
            self.rest.rebalance(master, known_nodes, ejected_nodes)
            self.monitor.monitor_rebalance(master)

    def create_buckets(self, empty_buckets=False):
        ram_quota = self.mem_quota / (self.test_config.cluster.num_buckets +
                                      self.test_config.cluster.emptybuckets)
        replica_number = self.test_config.bucket.replica_number
        replica_index = self.test_config.bucket.replica_index
        eviction_policy = self.test_config.bucket.eviction_policy
        threads_number = self.test_config.bucket.threads_number
        proxy_port = self.test_config.bucket.proxy_port
        password = self.test_config.bucket.password
        buckets = (self.test_config.emptybuckets if empty_buckets
                   else self.test_config.buckets)

        for master in self.masters():
            for bucket_name in buckets:
                self.rest.create_bucket(host_port=master,
                                        name=bucket_name,
                                        ram_quota=ram_quota,
                                        replica_number=replica_number,
                                        replica_index=replica_index,
                                        eviction_policy=eviction_policy,
                                        threads_number=threads_number,
                                        password=password,
                                        proxy_port=proxy_port)

    def configure_auto_compaction(self):
        compaction_settings = self.test_config.compaction
        for master in self.masters():
            self.rest.configure_auto_compaction(master, compaction_settings)

    def configure_internal_settings(self):
        internal_settings = self.test_config.internal_settings
        for master in self.masters():
            for parameter, value in internal_settings.items():
                self.rest.set_internal_settings(master,
                                                {parameter: int(value)})

    def configure_xdcr_settings(self):
        xdcr_cluster_settings = self.test_config.xdcr_cluster_settings
        for master in self.masters():
            for parameter, value in xdcr_cluster_settings.items():
                self.rest.set_xdcr_cluster_settings(master,
                                                    {parameter: int(value)})

    def tweak_memory(self):
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def restart_with_alternative_num_vbuckets(self):
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        cmd = 'ns_bucket:update_bucket_props("{}", ' \
              '[{{extra_config_string, "{}={}"}}]).'

        for option in ('defragmenter_enabled',
                       'exp_pager_stime',
                       'ht_locks',
                       'max_num_shards',
                       'max_threads',
                       'warmup_min_memory_threshold',
                       'bfilter_enabled'):
            value = getattr(self.test_config.bucket, option)
            if value != -1 and value is not None:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.masters():
                    for bucket in self.test_config.buckets:
                        diag_eval = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, diag_eval)
                self.remote.restart()

    def tune_logging(self):
        self.remote.tune_log_rotation()
        self.remote.restart()

    def restart_with_alternative_num_cpus(self):
        num_cpus = self.test_config.cluster.num_cpus
        if num_cpus:
            self.remote.restart_with_alternative_num_cpus(num_cpus)

    def restart_with_tcmalloc_aggressive_decommit(self):
        if self.test_config.cluster.tcmalloc_aggressive_decommit:
            self.remote.restart_with_tcmalloc_aggressive_decommit()

    def restart_with_sfwi(self):
        if self.test_config.cluster.sfwi:
            self.remote.restart_with_sfwi()

    def enable_auto_failover(self):
        for master in self.masters():
            self.rest.enable_auto_failover(master)

    def wait_until_warmed_up(self):
        target_iterator = TargetIterator(self.cluster_spec, self.test_config)
        for target in target_iterator:
            self.monitor.monitor_warmup(self.memcached, target.node,
                                        target.bucket)

    def wait_until_healthy(self):
        for master in self.cluster_spec.yield_masters():
            self.monitor.monitor_node_health(master)

    def change_watermarks(self):
        watermark_settings = self.test_config.watermark_settings
        for host_port, initial_nodes in zip(self.servers(),
                                            self.initial_nodes):
            host = host_port.split(':')[0]
            memcached_port = 11210
            for bucket in self.test_config.buckets:
                for key, val in watermark_settings.items():
                    val = self.memcached.calc_watermark(val, self.mem_quota)
                    self.memcached.set_flusher_param(host, memcached_port,
                                                     bucket, key, val)

    def start_cbq_engine(self):
        if self.test_config.cluster.run_cbq:
            self.remote.start_cbq()

    def change_dcp_io_threads(self):
        if self.test_config.secondaryindex_settings.db == 'memdb':
            self.remote.set_dcp_io_threads()
            time.sleep(30)
            self.remote.restart()
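
A minimal sketch of how this ClusterManager might be driven; the ordering is a hypothetical inference from the methods above, not part of the example.

# Hypothetical provisioning flow using only methods defined above.
def provision(cluster_spec, test_config, verbose=False):
    cm = ClusterManager(cluster_spec, test_config, verbose)
    cm.tweak_memory()        # reset swap, drop caches, disable THP
    cm.set_data_path()
    cm.set_auth()
    cm.set_mem_quota()
    cm.set_services()
    cm.add_nodes()           # add the initial nodes and rebalance
    cm.create_buckets()
    cm.configure_auto_compaction()
    cm.enable_auto_failover()
    cm.wait_until_warmed_up()
    cm.wait_until_healthy()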
Example #59
class Monitor(RestHelper):

    MAX_RETRY = 60
    MAX_RETRY_RECOVERY = 1200
    MAX_RETRY_TIMER_EVENT = 18000
    MAX_RETRY_BOOTSTRAP = 1200

    MONITORING_DELAY = 5

    POLLING_INTERVAL = 2
    POLLING_INTERVAL_INDEXING = 1
    POLLING_INTERVAL_MACHINE_UP = 10
    POLLING_INTERVAL_ANALYTICS = 15
    POLLING_INTERVAL_EVENTING = 1

    REBALANCE_TIMEOUT = 3600 * 6
    TIMEOUT = 3600 * 12

    DISK_QUEUES = (
        'ep_queue_size',
        'ep_flusher_todo',
        'ep_diskqueue_items',
        'vb_active_queue_size',
        'vb_replica_queue_size',
    )

    DCP_QUEUES = (
        'ep_dcp_replica_items_remaining',
        'ep_dcp_other_items_remaining',
    )

    XDCR_QUEUES = ('replication_changes_left', )

    def __init__(self, cluster_spec, test_config, verbose):
        super().__init__(cluster_spec=cluster_spec)
        self.cluster_spec = cluster_spec
        self.test_config = test_config
        self.remote = RemoteHelper(cluster_spec, verbose)

    def monitor_rebalance(self, host):
        logger.info('Monitoring rebalance status')

        is_running = True
        last_progress = 0
        last_progress_time = time.time()
        while is_running:
            time.sleep(self.POLLING_INTERVAL)

            is_running, progress = self.get_task_status(host,
                                                        task_type='rebalance')
            if progress == last_progress:
                if time.time() - last_progress_time > self.REBALANCE_TIMEOUT:
                    logger.error('Rebalance hung')
                    return
            else:
                last_progress = progress
                last_progress_time = time.time()

            if progress is not None:
                logger.info('Rebalance progress: {} %'.format(progress))

        logger.info('Rebalance completed')

    def _wait_for_empty_queues(self, host, bucket, queues, stats_function):
        metrics = list(queues)

        start_time = time.time()
        while metrics:
            bucket_stats = stats_function(host, bucket)
            # Iterate over a copy of `metrics`, since items are removed
            # from it inside the loop.
            for metric in list(metrics):
                stats = bucket_stats['op']['samples'].get(metric)
                if stats:
                    last_value = stats[-1]
                    if last_value:
                        logger.info('{} = {:,}'.format(metric, last_value))
                        continue
                    else:
                        logger.info('{} reached 0'.format(metric))
                    metrics.remove(metric)
            if metrics:
                time.sleep(self.POLLING_INTERVAL)
            if time.time() - start_time > self.TIMEOUT:
                raise Exception('Monitoring got stuck')

    def monitor_disk_queues(self, host, bucket):
        logger.info('Monitoring disk queues: {}'.format(bucket))
        self._wait_for_empty_queues(host, bucket, self.DISK_QUEUES,
                                    self.get_bucket_stats)

    def monitor_dcp_queues(self, host, bucket):
        logger.info('Monitoring DCP queues: {}'.format(bucket))
        self._wait_for_empty_queues(host, bucket, self.DCP_QUEUES,
                                    self.get_bucket_stats)

    def _wait_for_xdcr_to_start(self, host: str):
        is_running = False
        while not is_running:
            time.sleep(self.POLLING_INTERVAL)
            is_running, _ = self.get_task_status(host, task_type='xdcr')

    def monitor_xdcr_queues(self, host: str, bucket: str):
        logger.info('Monitoring XDCR queues: {}'.format(bucket))
        self._wait_for_xdcr_to_start(host)
        self._wait_for_empty_queues(host, bucket, self.XDCR_QUEUES,
                                    self.get_xdcr_stats)

    def _get_num_items(self,
                       host: str,
                       bucket: str,
                       total: bool = False) -> int:
        stats = self.get_bucket_stats(host=host, bucket=bucket)
        if total:
            curr_items = stats['op']['samples'].get('curr_items_tot')
        else:
            curr_items = stats['op']['samples'].get('curr_items')
        if curr_items:
            return curr_items[-1]
        return 0

    def monitor_num_items(self, host: str, bucket: str, num_items: int):
        logger.info('Checking the number of items in {}'.format(bucket))
        retries = 0
        while retries < self.MAX_RETRY:
            if self._get_num_items(host, bucket, total=True) == num_items:
                break
            time.sleep(self.POLLING_INTERVAL)
            retries += 1
        else:
            actual_items = self._get_num_items(host, bucket, total=True)
            raise Exception(
                'Mismatch in the number of items: {}'.format(actual_items))

    def monitor_task(self, host, task_type):
        logger.info('Monitoring task: {}'.format(task_type))
        time.sleep(self.MONITORING_DELAY)

        while True:
            time.sleep(self.POLLING_INTERVAL)

            tasks = [
                task for task in self.get_tasks(host)
                if task.get('type') == task_type
            ]
            if tasks:
                for task in tasks:
                    logger.info('{}: {}%, bucket: {}, ddoc: {}'.format(
                        task_type, task.get('progress'), task.get('bucket'),
                        task.get('designDocument')))
            else:
                break
        logger.info('Task {} successfully completed'.format(task_type))

    def monitor_warmup(self, memcached, host, bucket):
        logger.info('Monitoring warmup status: {}@{}'.format(bucket, host))

        memcached_port = self.get_memcached_port(host)

        while True:
            stats = memcached.get_stats(host, memcached_port, bucket, 'warmup')
            if b'ep_warmup_state' in stats:
                state = stats[b'ep_warmup_state']
                if state == b'done':
                    return float(stats.get(b'ep_warmup_time', 0))
                else:
                    logger.info('Warmup status: {}'.format(state))
                    time.sleep(self.POLLING_INTERVAL)
            else:
                logger.info('No warmup stats are available yet; polling')
                time.sleep(self.POLLING_INTERVAL)

    def monitor_compression(self, memcached, host, bucket):
        logger.info('Monitoring active compression status')

        memcached_port = self.get_memcached_port(host)

        json_docs = -1
        while json_docs:
            stats = memcached.get_stats(host, memcached_port, bucket)
            json_docs = int(stats[b'ep_active_datatype_json'])
            if json_docs:
                logger.info('Still uncompressed: {:,} items'.format(json_docs))
                time.sleep(self.POLLING_INTERVAL)
        logger.info('All items are compressed')

    def monitor_node_health(self, host):
        logger.info('Monitoring node health')

        for retry in range(self.MAX_RETRY):
            unhealthy_nodes = {
                n
                for n, status in self.node_statuses(host).items()
                if status != 'healthy'
            } | {
                n
                for n, status in self.node_statuses_v2(host).items()
                if status != 'healthy'
            }
            if unhealthy_nodes:
                time.sleep(self.POLLING_INTERVAL)
            else:
                break
        else:
            logger.interrupt(
                'Some nodes are not healthy: {}'.format(unhealthy_nodes))

    def monitor_analytics_node_active(self, host):
        logger.info('Monitoring analytics node health')

        for retry in range(self.MAX_RETRY):
            active = self.analytics_node_active(host)
            if active:
                break
            else:
                time.sleep(self.POLLING_INTERVAL)
        else:
            logger.interrupt(
                'Analytics node still not healthy: {}'.format(host))

    def is_index_ready(self, host: str) -> bool:
        for status in self.get_index_status(host)['status']:
            if status['status'] != 'Ready':
                return False
        return True

    def estimate_pending_docs(self, host: str) -> int:
        stats = self.get_gsi_stats(host)
        pending_docs = 0
        for metric, value in stats.items():
            if 'num_docs_queued' in metric or 'num_docs_pending' in metric:
                pending_docs += value
        return pending_docs

    def monitor_indexing(self, host):
        logger.info('Monitoring indexing progress')

        while not self.is_index_ready(host):
            time.sleep(self.POLLING_INTERVAL_INDEXING * 5)
            pending_docs = self.estimate_pending_docs(host)
            logger.info('Pending docs: {:,}'.format(pending_docs))

        logger.info('Indexing completed')

    def wait_for_secindex_init_build(self, host, indexes):
        # Poll until the initial index build is complete
        logger.info("Waiting for the following indexes to be ready: {}".format(
            indexes))

        indexes_ready = [0 for _ in indexes]

        def get_index_status(json2i, index):
            """Return the index status."""
            for d in json2i["status"]:
                if d["name"] == index:
                    return d["status"]
            return None

        @misc.retry(catch=(KeyError, ), iterations=10, wait=30)
        def update_indexes_ready():
            json2i = self.get_index_status(host)
            for i, index in enumerate(indexes):
                status = get_index_status(json2i, index)
                if status == 'Ready':
                    indexes_ready[i] = 1

        init_ts = time.time()
        while sum(indexes_ready) != len(indexes):
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            update_indexes_ready()
        finish_ts = time.time()
        logger.info('secondary index build time: {}'.format(finish_ts -
                                                            init_ts))
        time_elapsed = round(finish_ts - init_ts)
        return time_elapsed

    def wait_for_secindex_incr_build(self, index_nodes, bucket, indexes,
                                     numitems):
        # Poll until the incremental index build is complete
        logger.info('expecting {} num_docs_indexed for indexes {}'.format(
            numitems, indexes))

        # collect num_docs_indexed information globally from all index nodes
        def get_num_docs_indexed():
            data = self.get_index_stats(index_nodes)
            num_indexed = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_indexed"
                val = data[key]
                num_indexed.append(val)
            return num_indexed

        def get_num_docs_index_pending():
            data = self.get_index_stats(index_nodes)
            num_pending = []
            for index in indexes:
                key = "" + bucket + ":" + index + ":num_docs_pending"
                val1 = data[key]
                key = "" + bucket + ":" + index + ":num_docs_queued"
                val2 = data[key]
                val = int(val1) + int(val2)
                num_pending.append(val)
            return num_pending

        expected_num_pending = [0] * len(indexes)
        while True:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_num_pending = get_num_docs_index_pending()
            if curr_num_pending == expected_num_pending:
                break
        curr_num_indexed = get_num_docs_indexed()
        logger.info("Number of Items indexed {}".format(curr_num_indexed))

    def wait_for_num_connections(self, index_node, expected_connections):
        curr_connections = self.get_index_num_connections(index_node)
        retry = 1
        while curr_connections < expected_connections and retry < self.MAX_RETRY:
            time.sleep(self.POLLING_INTERVAL_INDEXING)
            curr_connections = self.get_index_num_connections(index_node)
            logger.info("Got current connections {}".format(curr_connections))
            retry += 1
        if retry == self.MAX_RETRY:
            return False
        return True

    def wait_for_recovery(self, index_nodes, bucket, index):
        time.sleep(self.MONITORING_DELAY)
        for retry in range(self.MAX_RETRY_RECOVERY):
            response = self.get_index_stats(index_nodes)
            item = "{}:{}:disk_load_duration".format(bucket, index)
            if item in response:
                return response[item]
            else:
                time.sleep(self.POLLING_INTERVAL)
        return -1

    def wait_for_servers(self):
        for retry in range(self.MAX_RETRY):
            logger.info('Waiting for all servers to be available')
            time.sleep(self.POLLING_INTERVAL_MACHINE_UP)

            for server in self.cluster_spec.servers:
                if not self.remote.is_up(server):
                    break
            else:
                logger.info('All nodes are up')
                return

        logger.interrupt('Some nodes are still down')

    def monitor_fts_indexing_queue(self, host: str, index: str, items: int):
        logger.info('Waiting for indexing to finish')
        count = 0
        while count < items:
            count = self.get_fts_doc_count(host, index)
            logger.info('FTS indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_fts_index_persistence(self, hosts: list, index: str):
        logger.info('Waiting for index to be persisted')
        pending_items = 1
        while pending_items:
            persist = 0
            compact = 0
            for host in hosts:
                stats = self.get_fts_stats(host)

                metric = '{}:{}:{}'.format(self.test_config.buckets[0], index,
                                           'num_recs_to_persist')
                persist += stats[metric]

                metric = '{}:{}:{}'.format(self.test_config.buckets[0], index,
                                           'total_compactions')
                compact += stats[metric]

            pending_items = persist or compact
            logger.info('Records to persist: {:,}'.format(persist))
            logger.info('Ongoing compactions: {:,}'.format(compact))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_indexing_queue(self, host: str, index: str):
        logger.info('Waiting for indexing to finish')
        items = int(self.test_config.fts_settings.test_total_docs)
        count = 0
        while count < items:
            count = self.get_elastic_doc_count(host, index)
            logger.info('Elasticsearch indexed documents: {:,}'.format(count))
            time.sleep(self.POLLING_INTERVAL)

    def monitor_elastic_index_persistence(self, host: str, index: str):
        logger.info('Waiting for index to be persisted')

        pending_items = -1
        while pending_items:
            stats = self.get_elastic_stats(host)
            pending_items = stats['indices'][index]['total']['translog'][
                'operations']
            logger.info('Records to persist: {:,}'.format(pending_items))
            time.sleep(self.POLLING_INTERVAL)

    def wait_for_bootstrap(self, nodes: list, function: str):
        logger.info(
            'Waiting for bootstrap of eventing function: {}'.format(function))
        for node in nodes:
            retry = 1
            while retry < self.MAX_RETRY_BOOTSTRAP:
                if function in self.get_deployed_apps(node):
                    break
                time.sleep(self.POLLING_INTERVAL)
                retry += 1
            if retry == self.MAX_RETRY_BOOTSTRAP:
                logger.info(
                    'Failed to bootstrap function: {}, node: {}'.format(
                        function, node))

    def get_num_analytics_items(self, data_node: str, bucket: str) -> int:
        stats_key = '{}:all:incoming_records_count_total'.format(bucket)
        num_items = 0
        for node in self.get_active_nodes_by_role(data_node, 'cbas'):
            stats = self.get_analytics_stats(node)
            num_items += stats[stats_key]
        return num_items

    def monitor_data_synced(self, data_node: str, bucket: str) -> int:
        logger.info('Waiting for data to be synced from {}'.format(data_node))

        num_items = self._get_num_items(data_node, bucket)

        while True:
            num_analytics_items = self.get_num_analytics_items(
                data_node, bucket)
            if num_analytics_items == num_items:
                break
            logger.info('Analytics has {:,} docs (target is {:,})'.format(
                num_analytics_items, num_items))
            time.sleep(self.POLLING_INTERVAL_ANALYTICS)

        return num_items

    def wait_for_timer_event(self,
                             node: str,
                             function: str,
                             event="DOC_TIMER_EVENTS"):
        logger.info('Waiting for timer events to start processing: {}'.format(
            function))
        retry = 1
        while retry < self.MAX_RETRY_TIMER_EVENT:
            if 0 < self.get_num_events_processed(
                    event=event, node=node, name=function):
                break
            time.sleep(self.POLLING_INTERVAL_EVENTING)
            retry += 1
        if retry == self.MAX_RETRY_TIMER_EVENT:
            logger.info(
                'Failed to get timer event for function: {}'.format(function))

    def wait_for_all_mutations_processed(self, host: str, bucket1: str,
                                         bucket2: str):
        logger.info(
            'Waiting for the eventing function to process all mutations')
        retry = 1
        while retry < self.MAX_RETRY_BOOTSTRAP:
            if self._get_num_items(host=host, bucket=bucket1) == \
                    self._get_num_items(host=host, bucket=bucket2):
                break
            retry += 1
            time.sleep(self.POLLING_INTERVAL_EVENTING)
        if retry == self.MAX_RETRY_BOOTSTRAP:
            logger.info('Timed out before all mutations were processed')
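
A minimal sketch, not part of the example: draining the bucket queues with this Monitor after a load phase. The iteration over cluster_spec.masters is an assumption borrowed from the other examples.

# Hypothetical helper; assumes cluster_spec/test_config are already parsed.
def wait_for_persistence(cluster_spec, test_config):
    monitor = Monitor(cluster_spec, test_config, verbose=False)
    for master in cluster_spec.masters:  # assumed master-node generator
        for bucket in test_config.buckets:
            monitor.monitor_disk_queues(master, bucket)
            monitor.monitor_dcp_queues(master, bucket)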
Example #60
class ClusterManager(object):
    def __init__(self, cluster_spec, test_config):
        self.cluster_spec = cluster_spec
        self.test_config = test_config

        self.rest = RestHelper(cluster_spec)
        self.remote = RemoteHelper(cluster_spec)
        self.monitor = Monitor(cluster_spec)
        self.memcached = MemcachedHelper(cluster_spec)

        self.clusters = cluster_spec.yield_clusters()
        self.servers = cluster_spec.yield_servers
        self.masters = cluster_spec.yield_masters
        self.hostnames = cluster_spec.yield_hostnames

        self.initial_nodes = test_config.cluster.initial_nodes
        self.mem_quota = test_config.cluster.mem_quota
        self.group_number = test_config.cluster.group_number or 1

    def set_data_path(self):
        data_path, index_path = self.cluster_spec.paths
        for server in self.servers():
            self.rest.set_data_path(server, data_path, index_path)

    def set_auth(self):
        for server in self.servers():
            self.rest.set_auth(server)

    def set_mem_quota(self):
        for server in self.servers():
            self.rest.set_mem_quota(server, self.mem_quota)

    def disable_moxi(self):
        if self.test_config.cluster.disable_moxi is not None:
            self.remote.disable_moxi()

    def create_server_groups(self):
        for master in self.masters():
            for i in range(1, self.group_number):
                name = 'Group {}'.format(i + 1)
                self.rest.create_server_group(master, name=name)

    def add_nodes(self):
        for (_, servers), initial_nodes in zip(self.clusters,
                                               self.initial_nodes):
            if initial_nodes < 2:  # Single-node cluster
                continue

            # Adding initial nodes
            master = servers[0]
            if self.group_number > 1:
                groups = self.rest.get_server_groups(master)
            else:
                groups = {}
            for i, host_port in enumerate(servers[1:initial_nodes], start=1):
                host = host_port.split(':')[0]
                uri = groups.get(
                    server_group(servers[:initial_nodes], self.group_number,
                                 i))
                self.rest.add_node(master, host, uri)

            # Rebalance
            master = servers[0]
            known_nodes = servers[:initial_nodes]
            ejected_nodes = []
            self.rest.rebalance(master, known_nodes, ejected_nodes)
            self.monitor.monitor_rebalance(master)

    def create_buckets(self):
        ram_quota = self.mem_quota / self.test_config.cluster.num_buckets
        replica_number = self.test_config.bucket.replica_number
        replica_index = self.test_config.bucket.replica_index
        eviction_policy = self.test_config.bucket.eviction_policy
        threads_number = self.test_config.bucket.threads_number

        for master in self.masters():
            for bucket_name in self.test_config.buckets:
                self.rest.create_bucket(
                    host_port=master,
                    name=bucket_name,
                    ram_quota=ram_quota,
                    replica_number=replica_number,
                    replica_index=replica_index,
                    eviction_policy=eviction_policy,
                    threads_number=threads_number,
                )

    def configure_auto_compaction(self):
        compaction_settings = self.test_config.compaction
        for master in self.masters():
            self.rest.configure_auto_compaction(master, compaction_settings)

    def configure_internal_settings(self):
        internal_settings = self.test_config.internal_settings
        for master in self.masters():
            for parameter, value in internal_settings.items():
                self.rest.set_internal_settings(master,
                                                {parameter: int(value)})

    def tweak_memory(self):
        self.remote.reset_swap()
        self.remote.drop_caches()
        self.remote.set_swappiness()
        self.remote.disable_thp()

    def restart_with_alternative_num_vbuckets(self):
        num_vbuckets = self.test_config.cluster.num_vbuckets
        if num_vbuckets is not None:
            self.remote.restart_with_alternative_num_vbuckets(num_vbuckets)

    def restart_with_alternative_bucket_options(self):
        cmd = 'ns_bucket:update_bucket_props("{}", ' \
              '[{{extra_config_string, "{}={}"}}]).'

        for option in ('max_num_shards', 'max_threads'):
            value = getattr(self.test_config.bucket, option)
            if value:
                logger.info('Changing {} to {}'.format(option, value))
                for master in self.masters():
                    for bucket in self.test_config.buckets:
                        diag_eval = cmd.format(bucket, option, value)
                        self.rest.run_diag_eval(master, diag_eval)
                self.remote.restart()

    def restart_with_alternative_num_cpus(self):
        num_cpus = self.test_config.cluster.num_cpus
        if num_cpus:
            self.remote.restart_with_alternative_num_cpus(num_cpus)

    def enable_auto_failover(self):
        for master in self.masters():
            self.rest.enable_auto_failover(master)

    def wait_until_warmed_up(self):
        target_iterator = TargetIterator(self.cluster_spec, self.test_config)
        for target in target_iterator:
            host = target.node.split(':')[0]
            self.monitor.monitor_warmup(self.memcached, host, target.bucket)

    def change_watermarks(self):
        watermark_settings = self.test_config.watermark_settings
        for hostname, initial_nodes in zip(self.hostnames(),
                                           self.initial_nodes):
            for bucket in self.test_config.buckets:
                for key, val in watermark_settings.items():
                    val = self.memcached.calc_watermark(val, self.mem_quota)
                    self.memcached.set_flusher_param(hostname, bucket, key,
                                                     val)

    def start_cbq_engine(self):
        if self.test_config.cluster.run_cbq:
            self.remote.start_cbq()
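
The same kind of hypothetical driver, adapted to this older two-argument ClusterManager; a sketch inferred from the methods above, not part of the example.

# Hypothetical provisioning flow for the two-argument variant.
def provision(cluster_spec, test_config):
    cm = ClusterManager(cluster_spec, test_config)
    cm.set_data_path()
    cm.set_auth()
    cm.set_mem_quota()
    cm.create_server_groups()  # extra groups only when group_number > 1
    cm.add_nodes()
    cm.create_buckets()
    cm.change_watermarks()     # apply watermark settings from test config
    cm.wait_until_warmed_up()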