    def setitup(self):
        # If the user forgets to assign the number of initial nodes for any
        # cluster, default to 1 node.
        if len(self._num_initial_nodes) < len(self._clusters_keys_olst):
            diff = len(self._clusters_keys_olst) - len(self._num_initial_nodes)
            for i in range(diff):
                self._num_initial_nodes.append('1')

        for key in self._clusters_keys_olst:
            clusterStatus = None
            if key == 0:
                clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status") or ClusterStatus()
            else:
                clusterStatus = CacheHelper.clusterstatus(cfg.CB_REMOTE_CLUSTER_TAG[key-1]+"_status") or\
                    ClusterStatus(cfg.CB_REMOTE_CLUSTER_TAG[key-1]+"_status")

            clusterStatus.all_available_hosts = ["%s:%s" % (node.ip, node.port) for node in self._clusters_dic[key]]

            self.set_the_cluster_up(self._clusters_dic[key][:int(self._num_initial_nodes[key])])

        time.sleep(20)

        if self._xdcr:
            self._link_create_replications(self._s_master, self._d_master, "cluster1")
            if self._rdirection == "bidirection":
                self._link_create_replications(self._d_master, self._s_master, "cluster0")
Example #2
    def setitup(self):
        # If the user forgets to assign the number of initial nodes for any
        # cluster, default to 1 node.
        if len(self._num_initial_nodes) < len(self._clusters_keys_olst):
            diff = len(self._clusters_keys_olst) - len(self._num_initial_nodes)
            for i in range(diff):
                self._num_initial_nodes.append('1')

        for key in self._clusters_keys_olst:
            clusterStatus = None
            if key == 0:
                clusterStatus = CacheHelper.clusterstatus(
                    cfg.CB_CLUSTER_TAG + "_status") or ClusterStatus()
            else:
                clusterStatus = CacheHelper.clusterstatus(cfg.CB_REMOTE_CLUSTER_TAG[key-1]+"_status") or\
                    ClusterStatus(cfg.CB_REMOTE_CLUSTER_TAG[key-1]+"_status")

            clusterStatus.all_available_hosts = [
                "%s:%s" % (node.ip, node.port)
                for node in self._clusters_dic[key]
            ]

            self.set_the_cluster_up(
                self._clusters_dic[key][:int(self._num_initial_nodes[key])])

        time.sleep(20)

        if self._xdcr:
            self._link_create_replications(self._s_master, self._d_master,
                                           "cluster1")
            if self._rdirection == "bidirection":
                self._link_create_replications(self._d_master, self._s_master,
                                               "cluster0")
def perform_admin_tasks(adminMsg, cluster_id=cfg.CB_CLUSTER_TAG + "_status"):
    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    if clusterStatus is None:
        logger.error("Unable to fetch clusterStatus from cache")
        return

    rest = clusterStatus.node_rest()

    # Add nodes
    servers = adminMsg["rebalance_in"]
    add_nodes(rest, servers, cluster_id)

    # Get all nodes
    allNodes = []
    for node in rest.node_statuses():
        allNodes.append(node.id)

    # Remove nodes
    servers = adminMsg["rebalance_out"]
    toBeEjectedNodes = remove_nodes(rest, servers, adminMsg["involve_orchestrator"], cluster_id)

    # Failover Node
    servers = adminMsg["failover"]
    auto_failover_servers = adminMsg["auto_failover"]
    only_failover = adminMsg["only_failover"]
    add_back_servers = adminMsg["add_back"]
    failoverNodes = failover_nodes(rest, servers, only_failover, adminMsg["involve_orchestrator"], cluster_id)
    autoFailoverNodes = auto_failover_nodes(
        rest, auto_failover_servers, only_failover, adminMsg["involve_orchestrator"], cluster_id
    )

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
    rest = clusterStatus.node_rest()
    addBackNodes = add_back_nodes(rest, add_back_servers, autoFailoverNodes + failoverNodes)
    toBeEjectedNodes.extend(failoverNodes)
    toBeEjectedNodes.extend(autoFailoverNodes)
    for node in addBackNodes:
        toBeEjectedNodes.remove(node)

    # SoftRestart a node
    servers = adminMsg["soft_restart"]
    restart(servers, cluster_id=cluster_id)

    # HardRestart a node
    servers = adminMsg["hard_restart"]
    restart(servers, type="hard", cluster_id=cluster_id)

    if not only_failover and (len(allNodes) > 0 or len(toBeEjectedNodes) > 0):
        logger.error("Rebalance")
        logger.error(allNodes)
        logger.error(toBeEjectedNodes)
        rest.rebalance(otpNodes=allNodes, ejectedNodes=toBeEjectedNodes)
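
perform_admin_tasks expects adminMsg to be a dict carrying all of the keys it reads above. A minimal sketch of such a payload is shown below; the values are made-up placeholders for illustration, not output of any real run:

# Hypothetical adminMsg payload for perform_admin_tasks (illustrative only;
# keys mirror the lookups above, values are placeholders).
adminMsg = {
    "rebalance_in": "2",            # add 2 spare nodes, or pass explicit "ip:port" entries
    "rebalance_out": "",            # nothing rebalanced out
    "failover": "",
    "auto_failover": "",
    "only_failover": False,
    "add_back": "",
    "involve_orchestrator": False,
    "soft_restart": "",
    "hard_restart": "",
}
perform_admin_tasks(adminMsg)
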
def perform_admin_tasks(adminMsg, cluster_id=cfg.CB_CLUSTER_TAG+"_status"):
    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    if clusterStatus is None:
        logger.error("Unable to fetch clusterStatus from cache")
        return

    rest = clusterStatus.node_rest()

    # Add nodes
    servers = adminMsg["rebalance_in"]
    add_nodes(rest, servers, cluster_id)

    # Get all nodes
    allNodes = []
    for node in rest.node_statuses():
        allNodes.append(node.id)

    # Remove nodes
    servers = adminMsg["rebalance_out"]
    toBeEjectedNodes = remove_nodes(rest, servers, adminMsg["involve_orchestrator"], cluster_id)

    # Failover Node
    servers = adminMsg["failover"]
    auto_failover_servers = adminMsg["auto_failover"]
    only_failover = adminMsg["only_failover"]
    add_back_servers = adminMsg["add_back"]
    failoverNodes = failover_nodes(rest, servers, only_failover, adminMsg["involve_orchestrator"], cluster_id)
    autoFailoverNodes = auto_failover_nodes(rest, auto_failover_servers, only_failover, adminMsg["involve_orchestrator"], cluster_id)

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
    rest = clusterStatus.node_rest()
    addBackNodes = add_back_nodes(rest, add_back_servers, autoFailoverNodes+failoverNodes)
    toBeEjectedNodes.extend(failoverNodes)
    toBeEjectedNodes.extend(autoFailoverNodes)
    for node in addBackNodes:
        toBeEjectedNodes.remove(node)

    # SoftRestart a node
    servers = adminMsg["soft_restart"]
    restart(servers, cluster_id=cluster_id)

    # HardRestart a node
    servers = adminMsg["hard_restart"]
    restart(servers, type='hard', cluster_id=cluster_id)

    if not only_failover and (len(allNodes) > 0 or len(toBeEjectedNodes) > 0):
        logger.error("Rebalance")
        logger.error(allNodes)
        logger.error(toBeEjectedNodes)
        rest.rebalance(otpNodes=allNodes, ejectedNodes=toBeEjectedNodes)
Example #5
def add_nodes(rest,
              servers='',
              cluster_id=cfg.CB_CLUSTER_TAG + "_status",
              zone_name='',
              services=None):
    # create zone if it does not exist
    if zone_name != '':
        if rest.is_zone_exist(zone_name) == False:
            rest.add_zone(zone_name)

    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id)
        count = int(servers)
        if (len(clusterStatus.all_available_hosts) -
                len(clusterStatus.nodes)) >= int(count):
            servers = list(
                set(clusterStatus.all_available_hosts) -
                set(clusterStatus.get_all_hosts()))
        else:
            logger.error(
                "Add nodes request invalid. # of nodes outside cluster is not enough"
            )
            return
        servers = servers[:count]
    for server in servers:
        logger.error("Adding node %s" % server)
        ip, port = parse_server_arg(server)
        if services:
            rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port,
                          zone_name, services)
        else:
            rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port,
                          zone_name)
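
add_nodes accepts either a bare node count or explicit whitespace-separated "ip:port" entries, plus an optional server group and service list. A hedged usage sketch follows; the service names use the common Couchbase convention ('kv', 'index', 'n1ql'), but the exact format rest.add_node expects is an assumption here:

# add two spare nodes into group "Group 2" running data + index services (illustrative)
add_nodes(rest, servers='2', zone_name='Group 2', services=['kv', 'index'])
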
Example #6
def queryRunner():

    hosts = None
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")

    if clusterStatus:
        hosts = clusterStatus.get_all_hosts()

    # retrieve all active query workloads
    queries = CacheHelper.active_queries()
    for query in queries:

        # async update query workload object
        updateQueryWorkload.apply_async(args=[query])

        count = int(query.qps)
        filters = list(set(query.include_filters) -
                       set(query.exclude_filters))
        params = generateQueryParams(query.indexed_key, query.bucket, filters,
                                     query.limit, query.startkey, query.endkey,
                                     query.startkey_docid, query.endkey_docid)
        multi_query.delay(count,
                          query.ddoc,
                          query.view,
                          params,
                          query.bucket,
                          query.password,
                          hosts=hosts)
def restart(servers='', type='soft', cluster_id=cfg.CB_CLUSTER_TAG+"_status"):
    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
        count = int(servers)
        if len(clusterStatus.nodes) >= int(count):
            servers = clusterStatus.get_all_hosts()
        else:
            logger.error("Restart nodes request invalid. # of nodes in cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        ip, port = parse_server_arg(server)
        node_ssh, node = create_ssh_conn(ip)
        if type != 'soft':
            logger.error('Hard Restart')
            cmd = "reboot"
        else:
            logger.error('Soft Restart')
            cmd = "/etc/init.d/couchbase-server restart"

        logger.error(cmd)
        result = node_ssh.execute_command(cmd, node)
        logger.error(result)
Example #8
def pick_nodesToRemove(servers='',
                       involve_orchestrator=False,
                       cluster_id=cfg.CB_CLUSTER_TAG + "_status"):
    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id)
        count = int(servers)
        temp_count = count
        servers = []
        if involve_orchestrator:
            servers.append("%s:%s" % (clusterStatus.orchestrator.ip,
                                      clusterStatus.orchestrator.port))
            temp_count = temp_count - 1

        if len(clusterStatus.nodes) > count:
            non_orchestrator_servers = list(
                set(clusterStatus.get_all_hosts()) - set([
                    "%s:%s" % (clusterStatus.orchestrator.ip,
                               clusterStatus.orchestrator.port)
                ]))
            servers.extend(non_orchestrator_servers[:temp_count])
        else:
            logger.error(
                "Remove nodes request invalid. # of nodes in cluster is not enough"
            )
            return []

    return servers
def restart(servers='', type='soft', cluster_id=cfg.CB_CLUSTER_TAG+"_status"):
    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id)
        count = int(servers)
        if len(clusterStatus.nodes) >= int(count):
            servers = clusterStatus.get_all_hosts()
        else:
            logger.error("Restart nodes request invalid. # of nodes in cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        ip, port = parse_server_arg(server)
        node_ssh, node = create_ssh_conn(ip)
        if type != 'soft':
            logger.error('Hard Restart')
            if cfg.COUCHBASE_OS == "windows":
                cmd = "shutdown -r -t 0"
            else:
                cmd = "reboot"
        else:
            logger.error('Soft Restart')
            if cfg.COUCHBASE_OS == "windows":
                cmd = "net stop couchbaseserver && net start couchbaseserver"
            else:
                cmd = "/etc/init.d/couchbase-server restart"

        logger.error(cmd)
        result = node_ssh.execute_command(cmd, node)
        logger.error(result)
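
Like the other helpers, restart accepts either a bare count of nodes (picked from the cached cluster status) or an explicit whitespace-separated "ip:port" list, detected by the presence of a '.'. A short usage sketch, assuming a populated cluster status cache and placeholder addresses:

# hard-restart (reboot) two nodes chosen from the cached cluster status
restart('2', type='hard')

# soft-restart specific nodes given explicitly
restart('10.1.2.10:8091 10.1.2.11:8091')
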
Example #10
def queryRunner():

    hosts = None
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status")

    if clusterStatus:
        hosts = clusterStatus.get_all_hosts()

    # retrieve all active query workloads
    queries = CacheHelper.active_queries()
    for query in queries:

        # async update query workload object
        updateQueryWorkload.apply_async(args=[query])

        count = int(query.qps)
        filters = list(set(query.include_filters) -
                       set(query.exclude_filters))
        params = generateQueryParams(query.indexed_key,
                                     query.bucket,
                                     filters,
                                     query.limit,
                                     query.startkey,
                                     query.endkey,
                                     query.startkey_docid,
                                     query.endkey_docid)
        multi_query.delay(count,
                          query.ddoc,
                          query.view,
                          params,
                          query.bucket,
                          query.password,
                          hosts=hosts)
def restart(servers="", type="soft", cluster_id=cfg.CB_CLUSTER_TAG + "_status"):
    if servers.find(".") != -1 or servers == "":
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
        count = int(servers)
        if len(clusterStatus.nodes) >= int(count):
            servers = clusterStatus.get_all_hosts()
        else:
            logger.error("Restart nodes request invalid. # of nodes in cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        ip, port = parse_server_arg(server)
        node_ssh, node = create_ssh_conn(ip)
        if type is not "soft":
            logger.error("Hard Restart")
            cmd = "reboot"
        else:
            logger.error("Soft Restart")
            cmd = "/etc/init.d/couchbase-server restart"

        logger.error(cmd)
        result = node_ssh.execute_command(cmd, node)
        logger.error(result)
def report_kv_latency(bucket="default"):

    if cfg.SERIESLY_IP == '':
        # seriesly not configured
        return

    rabbitHelper = report_kv_latency.rabbitHelper
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status") or\
        ClusterStatus()

    host = clusterStatus.get_random_host()
    if host is None: return

    ip, port = host.split(':')

    workloads = CacheHelper.workloads()
    for workload in workloads:
        if workload.active and workload.bucket == bucket:

            # read workload params
            bucket = str(workload.bucket)
            password = str(workload.password)

            # read template from active workload
            template = Template.from_cache(str(workload.template))
            template = template.__dict__
            client.decodeMajgicStrings(template)

            # setup key/val to use for timing
            key = _random_string(12)
            value = json.dumps(template['kv'])
            get_key = key


            # for get op, try to pull from consume_queue
            # so that we can calc impact of dgm
            consume_queue = workload.consume_queue
            if consume_queue is not None:
                keys = rabbitHelper.getJsonMsg(str(consume_queue), requeue=True)
                if len(keys) > 0:
                    get_key = str(keys[0])

            # collect op latency
            set_latency = client.mc_op_latency('set', key, value, ip, port, bucket, password)
            get_latency = client.mc_op_latency('get', get_key, value, ip, port, bucket, password)
            delete_latency = client.mc_op_latency('delete', key, value, ip, port, bucket, password)


            # report to seriesly
            seriesly = Seriesly(cfg.SERIESLY_IP, 3133)
            db = 'fast'
            seriesly[db].append({'set_latency' : set_latency,
                                 'get_latency' : get_latency,
                                 'delete_latency' : delete_latency})
Example #13
def getClusterStat(bucket, stat):

    val = 0
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status") or\
        ClusterStatus()
    host = clusterStatus.get_random_host()
    stat_checker = phandler.BucketStatChecker(bucket, addr=host)
    stats = stat_checker.get_stats()
    if len(stats) > 0:
        if stat in stats:
            val = stats[stat]

    return val
def getClusterStat(bucket, stat):

    val = 0
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status") or\
        ClusterStatus()
    host = clusterStatus.get_random_host()
    stat_checker = phandler.BucketStatChecker(bucket, addr=host)
    stats = stat_checker.get_stats()
    if len(stats) > 0:
        if stat in stats:
            val = stats[stat]

    return val
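
getClusterStat samples one random cached host and reads a single bucket stat through BucketStatChecker, returning 0 when the stat is unavailable. A minimal usage sketch; "curr_items" is only an example stat name:

# e.g. read the current item count of the default bucket
curr_items = getClusterStat("default", "curr_items")
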
Example #15
def get_ep_hostip_from_params(params):
    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")
    random_host = None
    try:
        random_host = clusterStatus.get_random_host().split(":")[0]
    except AttributeError:
        logger.error("Can not fetch cluster status information")
        pass

    host = params.get('ip') or random_host or cfg.COUCHBASE_IP
    port = params.get('port') or 11210

    return host, int(port)
def get_ep_hostip_from_params(params):
    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status")
    random_host = None
    try:
        random_host = clusterStatus.get_random_host().split(":")[0]
    except AttributeError:
        logger.error("Can not fetch cluster status information")
        pass

    host = params.get('ip') or random_host or cfg.COUCHBASE_IP
    port = params.get('port') or 11210

    return host, int(port)
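
The params dict consumed above only needs optional 'ip' and 'port' entries; anything missing falls back to a random cached host and then cfg.COUCHBASE_IP, with 11210 as the default port. Two illustrative calls (the address is a placeholder):

host, port = get_ep_hostip_from_params({'ip': '10.1.2.10', 'port': 11210})
host, port = get_ep_hostip_from_params({})  # random cached host or cfg.COUCHBASE_IP
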
def queue_op_cycles(workload):


    # read doc template
    template = Template.from_cache(str(workload.template))
    if template is None:
        logger.error("no doc template imported")
        return

    rabbitHelper = queue_op_cycles.rabbitHelper
    bucket = str(workload.bucket)
    task_queue = workload.task_queue

    active_hosts = None
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status")
    if clusterStatus is not None:
        active_hosts = clusterStatus.get_all_hosts()

    # create 20 op cycles
    for i in xrange(20):

        if workload.cc_queues is not None:
            # override template attribute with workload
            template.cc_queues = workload.cc_queues

        if len(workload.indexed_keys) > 0:
            template.indexed_keys = workload.indexed_keys

        # read workload settings
        bucketInfo = {"bucket": workload.bucket,
                      "password": workload.password}

        ops_sec = workload.ops_per_sec

        create_count = int(ops_sec * workload.create_perc / 100)
        update_count = int(ops_sec * workload.update_perc / 100)
        get_count = int(ops_sec * workload.get_perc / 100)
        del_count = int(ops_sec * workload.del_perc / 100)
        exp_count = int(ops_sec * workload.exp_perc / 100)
        consume_queue = workload.consume_queue

        ttl = workload.ttl
        miss_queue = workload.miss_queue
        miss_perc = workload.miss_perc

        generate_pending_tasks(task_queue, template, bucketInfo, create_count,
                               update_count, get_count, del_count, exp_count,
                               consume_queue, ttl, miss_perc, miss_queue, active_hosts)
Example #18
def queue_op_cycles(workload):

    # read doc template
    template = Template.from_cache(str(workload.template))
    if template is None:
        logger.error("no doc template imported")
        return

    rabbitHelper = queue_op_cycles.rabbitHelper
    bucket = str(workload.bucket)
    task_queue = workload.task_queue

    active_hosts = None
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")
    if clusterStatus is not None:
        active_hosts = clusterStatus.get_all_hosts()

    # create 20 op cycles
    for i in xrange(20):

        if workload.cc_queues is not None:
            # override template attribute with workload
            template.cc_queues = workload.cc_queues

        if len(workload.indexed_keys) > 0:
            template.indexed_keys = workload.indexed_keys

        # read workload settings
        bucketInfo = {"bucket": workload.bucket, "password": workload.password}

        ops_sec = workload.ops_per_sec

        create_count = int(ops_sec * workload.create_perc / 100)
        update_count = int(ops_sec * workload.update_perc / 100)
        get_count = int(ops_sec * workload.get_perc / 100)
        del_count = int(ops_sec * workload.del_perc / 100)
        exp_count = int(ops_sec * workload.exp_perc / 100)
        consume_queue = workload.consume_queue

        ttl = workload.ttl
        miss_queue = workload.miss_queue
        miss_perc = workload.miss_perc

        generate_pending_tasks(task_queue, template, bucketInfo, create_count,
                               update_count, get_count, del_count, exp_count,
                               consume_queue, ttl, miss_perc, miss_queue,
                               active_hosts)
def add_nodes(rest, servers='', cluster_id=cfg.CB_CLUSTER_TAG+"_status"):
    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
        count = int(servers)
        if (len(clusterStatus.all_available_hosts) - len(clusterStatus.nodes)) >= int(count):
            servers = list(set(clusterStatus.all_available_hosts) - set(clusterStatus.get_all_hosts()))
        else:
            logger.error("Add nodes request invalid. # of nodes outside cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        logger.error("Adding node %s" % server)
        ip, port = parse_server_arg(server)
        rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port)
def add_nodes(rest, servers='', cluster_id=cfg.CB_CLUSTER_TAG+"_status"):
    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id) or ClusterStatus(cluster_id)
        count = int(servers)
        if (len(clusterStatus.all_available_hosts) - len(clusterStatus.nodes)) >= int(count):
            servers = list(set(clusterStatus.all_available_hosts) - set(clusterStatus.get_all_hosts()))
        else:
            logger.error("Rebalance in request invalid. # of nodes outside cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        logger.error("Adding node %s" % server)
        ip, port = parse_server_arg(server)
        rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port)
Example #21
def queryRunner(max_msgs=10):

    rabbitHelper = queryRunner.rabbitHelper

    # check queue with pending http requests
    pending_http_requests = "query_multi_" + cfg.CB_CLUSTER_TAG
    if rabbitHelper.qsize(pending_http_requests) > max_msgs:

        # purge waiting tasks
        rabbitHelper.purge(pending_http_requests)
        query_ops_manager(max_msgs, True)

    else:

        hosts = None
        clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG +
                                                  "_status")

        if clusterStatus:
            hosts = clusterStatus.get_all_hosts()

        # retrieve all active query workloads
        queries = CacheHelper.active_queries()
        for query in queries:

            # async update query workload object
            updateQueryWorkload.apply_async(args=[query])

            count = int(query.qps)
            filters = list(set(query.include_filters) -
                           set(query.exclude_filters))
            params = generateQueryParams(query.indexed_key, query.bucket,
                                         filters, query.limit, query.startkey,
                                         query.endkey, query.startkey_docid,
                                         query.endkey_docid)
            multi_query.delay(count,
                              query.ddoc,
                              query.view,
                              params,
                              query.bucket,
                              query.password,
                              hosts=hosts)
def updateClusterStatus(ignore_result=True):

    done = False

    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status") or\
        ClusterStatus()

    # check cluster nodes
    cached_nodes = clusterStatus.nodes
    new_cached_nodes = []

    for node in cached_nodes:

        # get an active node
        if clusterStatus.http_ping_node(node) is not None:

            # get remaining nodes
            active_nodes = clusterStatus.get_cluster_nodes(node)

            # populate cache with healthy nodes
            for active_node in active_nodes:
                if active_node.status == 'healthy':
                    new_cached_nodes.append(active_node)

            break

    if len(new_cached_nodes) > 0:

        # check for update
        new_node_list = ["%s:%s" % (n.ip, n.port) for n in new_cached_nodes]

        if len(new_node_list) != len(cached_nodes) or\
            len(set(clusterStatus.get_all_hosts()).intersection(new_node_list)) !=\
                len(cached_nodes):
            clusterStatus.nodes = new_cached_nodes
            clusterStatus.update_orchestrator()
    else:
        clusterStatus.orchestrator = None
        ObjCacher().delete(CacheHelper.CLUSTERSTATUSKEY, clusterStatus)
Example #23
def pick_nodesToRemove(servers='', involve_orchestrator=False, cluster_id=cfg.CB_CLUSTER_TAG+"_status"):
    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id)
        count = int(servers)
        temp_count = count
        servers = []
        if involve_orchestrator:
            servers.append("%s:%s" % (clusterStatus.orchestrator.ip, clusterStatus.orchestrator.port))
            temp_count = temp_count - 1

        if len(clusterStatus.nodes) > count:
            non_orchestrator_servers = list(set(clusterStatus.get_all_hosts()) -
                               set(["%s:%s" % (clusterStatus.orchestrator.ip, clusterStatus.orchestrator.port)]))
            servers.extend(non_orchestrator_servers[:temp_count])
        else:
            logger.error("Remove nodes request invalid. # of nodes in cluster is not enough")
            return []

    return servers
Example #24
def queryRunner(max_msgs=10):

    rabbitHelper = queryRunner.rabbitHelper

    # check queue with pending http requests
    pending_http_requests = "query_multi_" + cfg.CB_CLUSTER_TAG
    if rabbitHelper.qsize(pending_http_requests) > max_msgs:

        # purge waiting tasks
        rabbitHelper.purge(pending_http_requests)
        query_ops_manager(max_msgs, True)

    else:

        hosts = None
        clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")

        if clusterStatus:
            hosts = clusterStatus.get_all_hosts()

        # retrieve all active query workloads
        queries = CacheHelper.active_queries()
        for query in queries:

            # async update query workload object
            updateQueryWorkload.apply_async(args=[query])

            count = int(query.qps)
            filters = list(set(query.include_filters) - set(query.exclude_filters))
            params = generateQueryParams(
                query.indexed_key,
                query.bucket,
                filters,
                query.limit,
                query.startkey,
                query.endkey,
                query.startkey_docid,
                query.endkey_docid,
            )
            multi_query.delay(count, query.ddoc, query.view, params, query.bucket, query.password, hosts=hosts)
def perform_xdcr_tasks(xdcrMsg):
    logger.error(xdcrMsg)
    src_master = create_server_obj()
    remote_id = ''
    if len(cfg.CB_REMOTE_CLUSTER_TAG) > 0:
        remote_id = cfg.CB_REMOTE_CLUSTER_TAG[0]+"_status"
    else:
        logger.error("No remote cluster tag. Can not create xdcr")
        return
    clusterStatus = CacheHelper.clusterstatus(remote_id) or ClusterStatus(remote_id)
    remote_ip = clusterStatus.get_random_host().split(":")[0]

    dest_master = create_server_obj(server_ip=remote_ip, username=xdcrMsg['dest_cluster_rest_username'],
                                    password=xdcrMsg['dest_cluster_rest_pwd'])
    dest_cluster_name = xdcrMsg['dest_cluster_name']
    xdcr_link_cluster(src_master, dest_master, dest_cluster_name)
    xdcr_start_replication(src_master, dest_cluster_name)

    if xdcrMsg['replication_type'] == "bidirection":
        src_cluster_name = dest_cluster_name + "_temp"
        xdcr_link_cluster(dest_master, src_master, src_cluster_name)
        xdcr_start_replication(dest_master, src_cluster_name)
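
perform_xdcr_tasks reads its settings from the xdcrMsg dict. A hedged sketch of such a message, with placeholder credentials; any replication_type other than "bidirection" results in one-way replication:

# Hypothetical xdcrMsg payload (illustrative only; values are placeholders)
xdcrMsg = {
    'dest_cluster_name': 'remote1',
    'dest_cluster_rest_username': 'Administrator',
    'dest_cluster_rest_pwd': 'password',
    'replication_type': 'unidirection',
}
perform_xdcr_tasks(xdcrMsg)
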
def epengine_stat_checker(workload):

    postcondition = workload.postconditions

    if isinstance(postcondition, dict):
        params = parse_condition_dict(postcondition)
    else:
        params = parse_condition(postcondition)

    random_host, port = get_ep_hostip_from_params(params)

    status = True
    all_hosts = [random_host]
    if params['cluster_check'] == True:
        clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status")
        all_hosts = clusterStatus.get_all_hosts()

    for host in all_hosts:
        statChecker = EPStatChecker(host.split(":")[0], port)
        status &= statChecker.check(postcondition)

    return status
Example #27
def updateClusterStatus(ignore_result=True):

    done = False

    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status") or\
        ClusterStatus()

    # check cluster nodes
    cached_nodes = clusterStatus.nodes
    new_cached_nodes = []

    for node in cached_nodes:

        # get an active node
        if clusterStatus.http_ping_node(node) is not None:

            # get remaining nodes
            active_nodes = clusterStatus.get_cluster_nodes(node)

            # populate cache with healthy nodes
            for active_node in active_nodes:
                if active_node.status == 'healthy':
                    new_cached_nodes.append(active_node)

            break

    if len(new_cached_nodes) > 0:

        # check for update
        new_node_list = ["%s:%s" % (n.ip, n.port) for n in new_cached_nodes]

        if len(new_node_list) != len(cached_nodes) or\
            len(set(clusterStatus.get_all_hosts()).intersection(new_node_list)) !=\
                len(cached_nodes):
            clusterStatus.nodes = new_cached_nodes
            clusterStatus.update_orchestrator()
    else:
        clusterStatus.orchestrator = None
        ObjCacher().delete(CacheHelper.CLUSTERSTATUSKEY, clusterStatus)
Example #28
def epengine_stat_checker(workload):

    postcondition = workload.postconditions

    if isinstance(postcondition, dict):
        params = parse_condition_dict(postcondition)
    else:
        params = parse_condition(postcondition)

    random_host, port = get_ep_hostip_from_params(params)

    status = True
    all_hosts = [random_host]
    if params['cluster_check'] == True:
        clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG +
                                                  "_status")
        all_hosts = clusterStatus.get_all_hosts()

    for host in all_hosts:
        statChecker = EPStatChecker(host.split(":")[0], port)
        status &= statChecker.check(postcondition)

    return status
Example #29
def add_nodes(rest, servers='', cluster_id=cfg.CB_CLUSTER_TAG+"_status", zone_name=''):

    # create zone if it does not exist
    if zone_name != '':
        if rest.is_zone_exist(zone_name) == False:
            rest.add_zone(zone_name)

    if servers.find('.') != -1 or servers == '':
        servers = servers.split()
    else:
        clusterStatus = CacheHelper.clusterstatus(cluster_id)
        count = int(servers)
        if (len(clusterStatus.all_available_hosts) - len(clusterStatus.nodes)) >= int(count):
            servers = list(set(clusterStatus.all_available_hosts) - set(clusterStatus.get_all_hosts()))
        else:
            logger.error("Add nodes request invalid. # of nodes outside cluster is not enough")
            return
        servers = servers[:count]

    for server in servers:
        logger.error("Adding node %s" % server)
        ip, port = parse_server_arg(server)
        rest.add_node(cfg.COUCHBASE_USER, cfg.COUCHBASE_PWD, ip, port, zone_name)
def perform_admin_tasks(adminMsg, cluster_id=cfg.CB_CLUSTER_TAG+"_status"):
    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    if clusterStatus is None:
        logger.error("Unable to fetch clusterStatus from cache: ")
        return

    rest = clusterStatus.node_rest()

    # Add nodes
    servers = adminMsg["rebalance_in"]
    zone_name = adminMsg["group"]
    if adminMsg["services"]:
        add_nodes(rest, servers, cluster_id, zone_name, adminMsg["services"])
    else:
        add_nodes(rest, servers, cluster_id, zone_name)

    # Get all nodes
    allNodes = []
    for node in rest.node_statuses():
        allNodes.append(node.id)

    # Remove nodes
    servers = adminMsg["rebalance_out"]
    toBeEjectedNodes = remove_nodes(rest, servers, adminMsg["involve_orchestrator"], cluster_id)

    # Failover Node
    servers = adminMsg["failover"]
    auto_failover_servers = adminMsg["auto_failover"]
    only_failover = adminMsg["only_failover"]
    add_back_servers = adminMsg["add_back"]
    failoverNodes = failover_nodes(rest, servers, only_failover, adminMsg["involve_orchestrator"], cluster_id)
    autoFailoverNodes = auto_failover_nodes(rest, auto_failover_servers, only_failover, adminMsg["involve_orchestrator"], cluster_id)

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    rest = clusterStatus.node_rest()
    addBackNodes = add_back_nodes(rest, add_back_servers, autoFailoverNodes+failoverNodes)
    toBeEjectedNodes.extend(failoverNodes)
    toBeEjectedNodes.extend(autoFailoverNodes)
    for node in addBackNodes:
        toBeEjectedNodes.remove(node)

    # SoftRestart a node
    servers = adminMsg["soft_restart"]
    restart(servers, cluster_id=cluster_id)

    # HardRestart a node
    servers = adminMsg["hard_restart"]
    restart(servers, type='hard', cluster_id=cluster_id)

    if adminMsg["soft_restart"] == '' and adminMsg["hard_restart"] == '':
        if not only_failover and (len(allNodes) > 0 or len(toBeEjectedNodes) > 0):
            logger.error("Rebalance")
            logger.error(allNodes)
            logger.error(toBeEjectedNodes)
            rest.rebalance(otpNodes=allNodes, ejectedNodes=toBeEjectedNodes)

    # do a soft restart on ejectedNodes that were failed over
    logger.error(toBeEjectedNodes)
    restartNodes = ""
    for node in toBeEjectedNodes:
        if node in (failoverNodes + autoFailoverNodes):
            if '@' in node:  # ns_X@hostname formatted
                node = node.split('@')[1]
            restartNodes = "%s %s" % (node, restartNodes)
    if len(restartNodes):
        restart(restartNodes)
Example #31
def report_kv_latency(bucket="default"):

    if cfg.SERIESLY_IP == '':
        # seriesly not configured
        return

    rabbitHelper = report_kv_latency.rabbitHelper
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status") or\
        ClusterStatus()

    host = clusterStatus.get_random_host()
    if host is None: return

    ip, port = host.split(':')

    workloads = CacheHelper.workloads()
    for workload in workloads:
        if workload.active and workload.bucket == bucket:

            # read workload params
            bucket = str(workload.bucket)
            password = str(workload.password)

            # read template from active workload
            template = Template.from_cache(str(workload.template))
            template = template.__dict__
            client.decodeMajgicStrings(template)

            # setup key/val to use for timing
            key = _random_string(12)
            value = json.dumps(template['kv'])
            get_key = key

            # for get op, try to pull from consume_queue
            # so that we can calc impact of dgm
            consume_queue = workload.consume_queue
            if consume_queue is not None:
                keys = rabbitHelper.getJsonMsg(str(consume_queue),
                                               requeue=True)
                if len(keys) > 0:
                    get_key = str(keys['start'])

            # collect op latency
            set_latency = client.mc_op_latency('set', key, value, ip, port,
                                               bucket, password)
            get_latency = client.mc_op_latency('get', get_key, value, ip, port,
                                               bucket, password)
            delete_latency = client.mc_op_latency('delete', key, value, ip,
                                                  port, bucket, password)

            # report to seriesly
            seriesly = Seriesly(cfg.SERIESLY_IP, 3133)
            db = None
            if 'fast' in seriesly.list_dbs():
                db = 'fast'
            else:
                bucketStatus = BucketStatus.from_cache(bucket) or BucketStatus(
                    bucket)
                db = bucketStatus.latency_db
                if db not in seriesly.list_dbs():
                    seriesly.create_db(db)

            if db is not None:
                seriesly[db].append({
                    'set_latency': set_latency,
                    'get_latency': get_latency,
                    'delete_latency': delete_latency
                })
Example #32
def run(workload):

    workload.active = True
    rabbitHelper = RabbitHelper()
    sdk_queue_key = "sdk_consumer.*"

    # read doc template
    template = Template.from_cache(str(workload.template))
    if template is None:
        logger.error("no doc template imported")
        return

    consumer_template = copy.deepcopy(template)
    bucket = str(workload.bucket)
    password = str(workload.password)

    active_hosts = None
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG + "_status")
    if clusterStatus is not None:
        active_hosts = clusterStatus.get_all_hosts()

    if workload.cc_queues is not None:
        # override template attribute with workload
        consumer_template.cc_queues = workload.cc_queues

    if len(workload.indexed_keys) > 0:
        template.indexed_keys = workload.indexed_keys

    ops_sec = workload.ops_per_sec

    # modify ops by number of consumers
    num_consumers = rabbitHelper.numExchangeQueues(cfg.CB_CLUSTER_TAG,
                                                   EXCHANGE)

    if num_consumers == 0:
        logger.error("No sdkclients running")
        return

    ops_sec = int(ops_sec) / num_consumers
    create_count = int(ops_sec * workload.create_perc / 100)
    update_count = int(ops_sec * workload.update_perc / 100)
    get_count = int(ops_sec * workload.get_perc / 100)
    del_count = int(ops_sec * workload.del_perc / 100)
    exp_count = int(ops_sec * workload.exp_perc / 100)
    consume_queue = workload.consume_queue

    ttl = workload.ttl
    miss_queue = workload.miss_queue
    miss_perc = workload.miss_perc

    # broadcast to sdk_consumers
    msg = {
        'bucket': bucket,
        'id': workload.id,
        'password': password,
        'template': consumer_template.__dict__,
        'ops_sec': ops_sec,
        'create_count': create_count,
        'update_count': update_count,
        'get_count': get_count,
        'del_count': del_count,
        'exp_count': exp_count,
        'consume_queue': consume_queue,
        'ttl': ttl,
        'miss_perc': miss_perc,
        'active': True,
        'active_hosts': active_hosts
    }

    rabbitHelper.putMsg('', json.dumps(msg), EXCHANGE)
    logger.error("start task sent to %s consumers" % num_consumers)
Example #33
def perform_admin_tasks(adminMsg, cluster_id=cfg.CB_CLUSTER_TAG+"_status"):
    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    if clusterStatus is None:
        logger.error("Unable to fetch clusterStatus from cache: ")
        return

    rest = clusterStatus.node_rest()

    # Add nodes
    servers = adminMsg["rebalance_in"]
    zone_name = adminMsg["group"]
    add_nodes(rest, servers, cluster_id, zone_name)

    # Get all nodes
    allNodes = []
    for node in rest.node_statuses():
        allNodes.append(node.id)

    # Remove nodes
    servers = adminMsg["rebalance_out"]
    toBeEjectedNodes = remove_nodes(rest, servers, adminMsg["involve_orchestrator"], cluster_id)

    # Failover Node
    servers = adminMsg["failover"]
    auto_failover_servers = adminMsg["auto_failover"]
    only_failover = adminMsg["only_failover"]
    add_back_servers = adminMsg["add_back"]
    failoverNodes = failover_nodes(rest, servers, only_failover, adminMsg["involve_orchestrator"], cluster_id)
    autoFailoverNodes = auto_failover_nodes(rest, auto_failover_servers, only_failover, adminMsg["involve_orchestrator"], cluster_id)

    app.workload_manager.updateClusterStatus()
    clusterStatus = CacheHelper.clusterstatus(cluster_id)
    rest = clusterStatus.node_rest()
    addBackNodes = add_back_nodes(rest, add_back_servers, autoFailoverNodes+failoverNodes)
    toBeEjectedNodes.extend(failoverNodes)
    toBeEjectedNodes.extend(autoFailoverNodes)
    for node in addBackNodes:
        toBeEjectedNodes.remove(node)

    # SoftRestart a node
    servers = adminMsg["soft_restart"]
    restart(servers, cluster_id=cluster_id)

    # HardRestart a node
    servers = adminMsg["hard_restart"]
    restart(servers, type='hard', cluster_id=cluster_id)

    if adminMsg["soft_restart"] == '' and adminMsg["hard_restart"] == '':
        if not only_failover and (len(allNodes) > 0 or len(toBeEjectedNodes) > 0):
            logger.error("Rebalance")
            logger.error(allNodes)
            logger.error(toBeEjectedNodes)
            rest.rebalance(otpNodes=allNodes, ejectedNodes=toBeEjectedNodes)

    # do a soft restart on ejectedNodes that were failed over
    logger.error(toBeEjectedNodes)
    restartNodes = ""
    for node in toBeEjectedNodes:
        if node in (failoverNodes + autoFailoverNodes):
            if '@' in node:  # ns_X@hostname formatted
                node = node.split('@')[1]
            restartNodes = "%s %s" % (node, restartNodes)
    if len(restartNodes):
        restart(restartNodes)
def run(workload):

    workload.active = True
    rabbitHelper = RabbitHelper()
    sdk_queue_key = "sdk_consumer.*"

    # read doc template
    template = Template.from_cache(str(workload.template))
    if template is None:
        logger.error("no doc template imported")
        return

    consumer_template = copy.deepcopy(template)
    bucket = str(workload.bucket)
    password = str(workload.password)

    active_hosts = None
    clusterStatus = CacheHelper.clusterstatus(cfg.CB_CLUSTER_TAG+"_status")
    if clusterStatus is not None:
        active_hosts = clusterStatus.get_all_hosts()


    if workload.cc_queues is not None:
        # override template attribute with workload
        consumer_template.cc_queues = workload.cc_queues

    if len(workload.indexed_keys) > 0:
        template.indexed_keys = workload.indexed_keys


    ops_sec = workload.ops_per_sec

    # modify ops by number of consumers
    num_consumers = rabbitHelper.numExchangeQueues(cfg.CB_CLUSTER_TAG, EXCHANGE)

    if num_consumers == 0:
        logger.error("No sdkclients running")
        return

    ops_sec = int(ops_sec) / num_consumers
    create_count = int(ops_sec * workload.create_perc / 100)
    update_count = int(ops_sec * workload.update_perc / 100)
    get_count = int(ops_sec * workload.get_perc / 100)
    del_count = int(ops_sec * workload.del_perc / 100)
    exp_count = int(ops_sec * workload.exp_perc / 100)
    consume_queue = workload.consume_queue

    ttl = workload.ttl
    miss_queue = workload.miss_queue
    miss_perc = workload.miss_perc

    # broadcast to sdk_consumers
    msg = {'bucket' : bucket,
           'id' : workload.id,
           'password' : password,
           'template' : consumer_template.__dict__,
           'ops_sec' : ops_sec,
           'create_count' : create_count,
           'update_count' : update_count,
           'get_count' : get_count,
           'del_count' : del_count,
           'exp_count' : exp_count,
           'consume_queue' : consume_queue,
           'ttl' : ttl,
           'miss_perc' : miss_perc,
           'active' : True,
           'active_hosts' : active_hosts}

    rabbitHelper.putMsg('', json.dumps(msg), EXCHANGE)
    logger.error("start task sent to %s consumers" % num_consumers)