Example 1
def rebalance(runner):

    # find out about the cluster.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    # stop list
    target_host_ids = []
    targets = []
    if runner.opts.stoplist:
        targets = runner.opts.stoplist.split(',')
        for host in targets:
            target = hosts.get_host(host)
            if target is None:
                runner.abort('Host not found in cluster: %s' % host)
            else:
                target_host_ids.append(str(target.id))

    # Connect to an arbitrary host that isn't being stopped.
    chost = hosts.get_connection_host(targets)
    if chost is None:
        runner.abort(
            'Could not find a host other than the hosts to be stopped in cluster: %s'
            % runner.opts.target_host)

    user_info = ''
    if runner.opts.username:
        user_info = ', user: %s' % runner.opts.username

    runner.info('Connecting to %s:%d%s (%s) to issue "rebalance" command' %
                (chost.get_admininterface(), chost.adminport, user_info,
                 chost.hostname))

    # set up connections
    runner.voltdb_connect(chost.get_admininterface(), chost.adminport,
                          runner.opts.username, runner.opts.password)

    json_opts = ['command:"rebalance"']
    if target_host_ids:
        stoplist = 'hosts:%s' % ("-".join(target_host_ids))
        json_opts.append(stoplist)

    if runner.opts.kfactor:
        json_opts.append('kfactor:%s' % (runner.opts.kfactor))

    if runner.opts.hostcount:
        json_opts.append('hostcount:%s' % (runner.opts.hostcount))

    if not runner.opts.dryrun:
        response = runner.call_proc('@Rebalance',
                                    [VOLT.FastSerializer.VOLTTYPE_STRING],
                                    ['{%s}' % (','.join(json_opts))])
        print response
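
For reference, a minimal sketch of the options string the code above assembles; the stop list of host ids 1 and 3 and the kfactor of 1 are assumed values for illustration:

# Illustration only: the pieces appended to json_opts above are joined with
# commas and wrapped in braces before being passed to @Rebalance. The values
# below (stop list of host ids 1 and 3, kfactor 1) are assumed.
json_opts = ['command:"rebalance"', 'hosts:1-3', 'kfactor:1']
payload = '{%s}' % (','.join(json_opts))
# payload == '{command:"rebalance",hosts:1-3,kfactor:1}'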
Example 2
def resize(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(*tuple)

    # get current version and root directory from an arbitrary node
    host = next(iter(hosts.hosts_by_id.values()))

    # check the version of the target cluster to make sure elastic resize is supported
    version = host.version
    versionStr = version.split('.')
    majorVersion = int(versionStr[0])
    minorVersion = int(versionStr[1])
    if "license" not in host:
        runner.abort('Elastic resize only available for enterprise edition.')

    if majorVersion < RELEASE_MAJOR_VERSION or (
            majorVersion == RELEASE_MAJOR_VERSION
            and minorVersion < RELEASE_MINOR_VERSION):
        runner.abort('The version of the target cluster is ' + version +
                     ', which is lower than version ' +
                     str(RELEASE_MAJOR_VERSION) + '.' +
                     str(RELEASE_MINOR_VERSION) +
                     ' required to support elastic resize.')

    # Convert shutdown delay input of minutes to millis
    shutdown_delay = runner.opts.shutdown_delay * 60000 if runner.opts.shutdown_delay > 0 else -1
    option = runner.opts.opt
    result = runner.call_proc('@ElasticRemoveNT', [
        VOLT.FastSerializer.VOLTTYPE_TINYINT,
        VOLT.FastSerializer.VOLTTYPE_STRING,
        VOLT.FastSerializer.VOLTTYPE_STRING,
        VOLT.FastSerializer.VOLTTYPE_BIGINT
    ], [option, '', ','.join(runner.opts.skip_requirements), shutdown_delay
        ]).table(0)
    status = result.tuple(0).column_integer(0)
    message = result.tuple(0).column_string(1)
    if option in (Option.TEST, Option.START) and "host ids:" in message:
        host_names = ', '.join([
            hostIdsToNames(id, hosts) for id in re.search(
                'host ids: \[(.+?)\]', message).group(1).split(',')
        ])
        if option == Option.TEST:
            message = "Hosts will be removed: [" + host_names + "], " + message
        elif option == Option.START:
            message = "Starting cluster resize: Removing hosts: [" + host_names + "], " + message
    if status == 0:
        runner.info(message)
    else:
        runner.error(message)
        sys.exit(1)
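
The code above calls a hostIdsToNames helper whose definition is not shown; a plausible sketch, assuming hosts_by_id maps integer host ids to host objects that expose a hostname attribute:

def hostIdsToNames(id, hosts):
    # Hypothetical implementation (the real helper is not part of this
    # example): map a host id string taken from the @ElasticRemoveNT message
    # back to a hostname, falling back to the raw id when it is unknown.
    host = hosts.hosts_by_id.get(int(id.strip()))
    return host.hostname if host is not None else id.strip()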
Example 3
def basicCheck(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    # get current version and root directory from an arbitrary node
    host = hosts.hosts_by_id.itervalues().next()
    fullClusterSize = int(host.fullclustersize)
    if len(hosts.hosts_by_id) < fullClusterSize:
        runner.abort(
            "Current cluster needs %d more node(s) to achieve full K-safety. In-service upgrade is not recommended in partial K-safety cluster."
            % (fullClusterSize - len(hosts.hosts_by_id)))

    if fullClusterSize % 2 == 1 and runner.opts.newNode is None:
        runner.abort(
            "The cluster has odd number of nodes, plan_upgrade needs an extra node to generate the instructions"
        )

    host = hosts.hosts_by_id.itervalues().next()
    currentVersion = host.version
    currentVoltDBRoot = host.voltdbroot
    currentDeployment = host.deployment
    xmlroot = ElementTree.parse(currentDeployment).getroot()
    cluster = xmlroot.find("./cluster")
    if cluster is None:
        runner.abort("Couldn't find cluster tag in current deployment file")
    kfactor_tag = cluster.get('kfactor')
    if kfactor_tag is None:
        kfactor = 0
    else:
        kfactor = int(kfactor_tag)

    # sanity check, in case the node count or K-factor is less than required
    # K = 0, abort with error message
    if kfactor == 0:
        runner.abort(
            "Current cluster doesn't have duplicate partitions to perform in-service upgrade. K-factor: %d"
            % kfactor)

    # N = 1, abort with error message
    if fullClusterSize == 1:
        runner.abort(
            "Current cluster doesn't have enough node to perform in-service upgrade, at least two nodes are required"
        )

    return hosts, kfactor
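
The kfactor lookup above reads the deployment file directly with ElementTree; a small standalone sketch of the same lookup (the file path and kfactor value are assumptions):

# Sketch of the deployment fragment the lookup above expects:
#   <deployment>
#       <cluster kfactor="1" ... />
#   </deployment>
from xml.etree import ElementTree

xmlroot = ElementTree.parse('/path/to/deployment.xml').getroot()  # hypothetical path
cluster = xmlroot.find('./cluster')
kfactor = int(cluster.get('kfactor') or 0) if cluster is not None else 0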
Example 4
def stop(runner):

    # Exec @SystemInformation to find out about the cluster.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    # Connect to an arbitrary host that isn't being stopped.
    defaultport = 3021
    min_hosts = 1
    max_hosts = 1
    target_host = utility.parse_hosts(runner.opts.target_host, min_hosts,
                                      max_hosts, defaultport)[0]
    (thost,
     chost) = hosts.get_target_and_connection_host(target_host.host,
                                                   target_host.port)
    if thost is None:
        runner.abort('Host not found in cluster: %s:%d' %
                     (target_host.host, target_host.port))
    if chost is None:
        runner.abort(
            'The entire cluster is being stopped, use "shutdown" instead.')

    if runner.opts.username:
        user_info = ', user: %s' % runner.opts.username
    else:
        user_info = ''
    runner.info('Connecting to %s:%d%s (%s) to issue "stop" command' %
                (chost.get_admininterface(), chost.adminport, user_info,
                 chost.hostname))
    runner.voltdb_connect(chost.get_admininterface(), chost.adminport,
                          runner.opts.username, runner.opts.password,
                          runner.opts.ssl_config, runner.opts.kerberos)

    # Stop the requested host using exec @StopNode HOST_ID
    runner.info('Stopping host %d: %s:%s' %
                (thost.id, thost.hostname, thost.internalport))
    if not runner.opts.dryrun:
        response = runner.call_proc('@StopNode',
                                    [VOLT.FastSerializer.VOLTTYPE_INTEGER],
                                    [thost.id],
                                    check_status=False)
        print response
        if response.status() != 1:  # not SUCCESS
            sys.exit(1)
Example 5
def findTargetHsId(runner):
    # Exec @SystemInformation to find out about the cluster.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])
    # Connect to an arbitrary host that isn't being stopped.
    defaultport = 3021
    min_hosts = 1
    max_hosts = 1
    target_host = utility.parse_hosts(runner.opts.target_host, min_hosts, max_hosts, defaultport)[0]
    (thost, chost) = hosts.get_target_and_connection_host(target_host.host, target_host.port)
    if thost is None:
        runner.abort('Host not found in cluster: %s:%d' % (target_host.host, target_host.port))
    return thost.id
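
A brief usage sketch: the host id returned here is the value that @StopNode expects, mirroring the call made in the stop command of Example 4 (the surrounding call is illustrative):

# Illustration only: feed the resolved host id into @StopNode, as the stop
# command does in Example 4.
target_hsid = findTargetHsId(runner)
response = runner.call_proc('@StopNode',
                            [VOLT.FastSerializer.VOLTTYPE_INTEGER],
                            [target_hsid],
                            check_status=False)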
Example 6
def getOwnClusterId(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    # get current version and root directory from an arbitrary node
    host = hosts.hosts_by_id.itervalues().next()

    # ClusterId was added to @SystemInformation in v7.2, so check the version of the target cluster to make sure this works properly.
    version = host.version
    versionStr = version.split('.')
    majorVersion = int(versionStr[0])
    minorVersion = int(versionStr[1])
    if majorVersion < RELEASE_MAJOR_VERSION or (majorVersion == RELEASE_MAJOR_VERSION and minorVersion < RELEASE_MINOR_VERSION):
        return -1

    return int(host.clusterid)
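
Several of these examples repeat the same guard against RELEASE_MAJOR_VERSION and RELEASE_MINOR_VERSION; a sketch of that check factored into one helper (the constant values are assumptions based on the v7.2 comments above):

# Assumed values: the examples only reference these module-level constants,
# and their comments point at v7.2 as the minimum release.
RELEASE_MAJOR_VERSION = 7
RELEASE_MINOR_VERSION = 2

def meets_minimum_version(version):
    # True when a "major.minor[.patch]" version string is at least the
    # required release; mirrors the inline checks used in the examples.
    parts = version.split('.')
    return (int(parts[0]), int(parts[1])) >= (RELEASE_MAJOR_VERSION,
                                              RELEASE_MINOR_VERSION)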
Example 7
def procedureCaller(runner, type):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    # get current version and root directory from an arbitrary node
    host = hosts.hosts_by_id.itervalues().next()

    # check the version of the target cluster to make sure elastic resize is supported
    version = host.version
    versionStr = version.split('.')
    majorVersion = int(versionStr[0])
    minorVersion = int(versionStr[1])
    if majorVersion < RELEASE_MAJOR_VERSION or (
            majorVersion == RELEASE_MAJOR_VERSION
            and minorVersion < RELEASE_MINOR_VERSION):
        runner.abort('The version of the target cluster is ' + version +
                     ', which is lower than version ' +
                     str(RELEASE_MAJOR_VERSION) + '.' +
                     str(RELEASE_MINOR_VERSION) +
                     ' required to support elastic resize.')

    result = runner.call_proc('@ElasticRemoveNT', [
        VOLT.FastSerializer.VOLTTYPE_TINYINT,
        VOLT.FastSerializer.VOLTTYPE_STRING,
        VOLT.FastSerializer.VOLTTYPE_STRING
    ], [type, '', ','.join(runner.opts.skip_requirements)]).table(0)
    status = result.tuple(0).column_integer(0)
    message = result.tuple(0).column_string(1)
    if status == 0:
        runner.info(message)
    else:
        runner.error(message)
        sys.exit(1)
Example 8
def getClusterInfo(runner, available_hosts, clearHostCache):
    # raise an exception when the connection fails
    response = runner.call_proc('@SystemInformation',
                                    [VOLT.FastSerializer.VOLTTYPE_STRING],
                                    ['OVERVIEW'],
                                    True, None, True)
    if response.response.status != 1:
        return None

    if clearHostCache:
        available_hosts[:] = []

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    for hostId, hostInfo in hosts.hosts_by_id.items():
        if hostInfo.hostname not in available_hosts:
            available_hosts.append(hostInfo.hostname + ":" + str(hostInfo.clientport))

    # get current version and root directory from an arbitrary node
    host = hosts.hosts_by_id.itervalues().next()

    # ClusterId was added to @SystemInformation in v7.2, so check the version of the target cluster to make sure this works properly.
    version = host.version
    versionStr = version.split('.')
    majorVersion = int(versionStr[0])
    minorVersion = int(versionStr[1])
    if majorVersion < RELEASE_MAJOR_VERSION or (majorVersion == RELEASE_MAJOR_VERSION and minorVersion < RELEASE_MINOR_VERSION):
        runner.abort("Only v7.2 or higher version of VoltDB supports this command. Target cluster running on v" + version + ".")

    clusterId = host.clusterid
    fullClusterSize = int(host.fullclustersize)
    uptime = host.uptime

    response = runner.call_proc('@SystemInformation',
                                    [VOLT.FastSerializer.VOLTTYPE_STRING],
                                    ['DEPLOYMENT'],
                                    True, None, True)
    for tuple in response.table(0).tuples():
        if tuple[0] == 'kfactor':
            kfactor = tuple[1]
            break


    cluster = Cluster(int(clusterId), version, int(kfactor), int(fullClusterSize), uptime)
    for hostId, hostInfo in hosts.hosts_by_id.items():
        cluster.add_member(hostId, hostInfo.hostname)

    # number of live clients connected to the cluster
    try:
        response = checkstats.get_stats(runner, "LIVECLIENTS")
    except StatisticsProcedureException as e:
        runner.info(e.message)
        sys.exit(e.exitCode)

    liveclients = 0
    for tuple in response.table(0).tuples():
        isAdmin = tuple[5]
        # exclude admin connections
        if isAdmin != 1:
            liveclients += 1
    cluster.update_live_clients(liveclients)

    if runner.opts.dr:
        # Do we have any ongoing DR conversation?
        try:
            response = checkstats.get_stats(runner, "DRROLE")
        except StatisticsProcedureException as e:
            runner.info(e.message)
            sys.exit(e.exitCode)

        for tuple in response.table(0).tuples():
            role = tuple[0]
            status = tuple[1]
            remote_cluster_id = tuple[2]
            if (remote_cluster_id != -1):
                cluster.add_remote_cluster(remote_cluster_id, status, role)

        try:
            response = checkstats.get_stats(runner, "DRPRODUCER")
        except StatisticsProcedureException as e:
            runner.info(e.message)
            sys.exit(e.exitCode)
        for tuple in response.table(0).tuples():
            host_name = tuple[2]
            remote_cluster_id = tuple[4]
            last_queued_drid = tuple[10]
            last_queued_ts = tuple[12]
            last_acked_ts = tuple[13]
            if last_queued_drid == -1:
                delay = 0
            else:
                delay = (last_queued_ts - last_acked_ts).total_seconds()
            cluster.get_remote_cluster(remote_cluster_id).update_producer_latency(host_name, remote_cluster_id, delay)

        # Find remote topology through drconsumer stats
        try:
            response = checkstats.get_stats(runner, "DRCONSUMER")
        except StatisticsProcedureException as e:
            runner.info(e.message)
            sys.exit(e.exitCode)

        for tuple in response.table(1).tuples():
            remote_cluster_id = tuple[4]
            covering_host = tuple[7]
            last_applied_ts = tuple[9]
            if covering_host != '':
                cluster.get_remote_cluster(remote_cluster_id).add_remote_member(covering_host)
    return cluster
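
getClusterInfo drives Cluster and remote-cluster objects whose definitions are not included here; a minimal skeleton inferred from the methods the example calls (attribute names are assumptions) might look like this:

class RemoteCluster(object):
    # Hypothetical; inferred from the calls made in the example above.
    def __init__(self, cluster_id, status, role):
        self.cluster_id = cluster_id
        self.status = status
        self.role = role
        self.members = []
        self.producer_latency = {}   # host name -> seconds of delay

    def add_remote_member(self, hostname):
        if hostname not in self.members:
            self.members.append(hostname)

    def update_producer_latency(self, host_name, remote_cluster_id, delay):
        self.producer_latency[host_name] = delay


class Cluster(object):
    # Hypothetical skeleton inferred from getClusterInfo; the real class in
    # the voltadmin sources may differ.
    def __init__(self, cluster_id, version, kfactor, size, uptime):
        self.cluster_id, self.version = cluster_id, version
        self.kfactor, self.size, self.uptime = kfactor, size, uptime
        self.members = {}          # host id -> hostname
        self.remote_clusters = {}  # remote cluster id -> RemoteCluster
        self.live_clients = 0

    def add_member(self, host_id, hostname):
        self.members[host_id] = hostname

    def update_live_clients(self, count):
        self.live_clients = count

    def add_remote_cluster(self, remote_id, status, role):
        self.remote_clusters[remote_id] = RemoteCluster(remote_id, status, role)

    def get_remote_cluster(self, remote_id):
        return self.remote_clusters[remote_id]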
Example 9
def basicCheck(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    # get current version and root directory from an arbitrary node
    host = next(iter(hosts.hosts_by_id.values()))
    fullClusterSize = int(host.fullclustersize)
    if len(hosts.hosts_by_id) < fullClusterSize:
        delta = fullClusterSize - len(hosts.hosts_by_id)
        runner.abort(
            "Current cluster needs %d more node%s to achieve full K-safety. Online upgrade is not supported in partial K-safety cluster."
            % (delta, "" if delta == 1 else "s"))

    if fullClusterSize % 2 == 1 and runner.opts.newNode is None:
        runner.abort(
            "The cluster has odd number of nodes, plan_upgrade needs an extra node to generate the instructions"
        )
    if fullClusterSize % 2 == 0 and runner.opts.newNode is not None:
        runner.abort(
            "For even-numbered cluster, 2 parameters are expected (received 3)."
        )

    if runner.opts.newNode is not None:
        result = checkNewNode(runner.opts.newNode)
        if result is not None:
            runner.abort("Failed to resolve host {0}:{1}.".format(
                runner.opts.newNode, result))

    host = next(iter(hosts.hosts_by_id.values()))
    currentVersion = host.version

    # get k-factor from @SystemInformation
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'])
    for tuple in response.table(0).tuples():
        if tuple[0] == 'kfactor':
            kfactor = tuple[1]
            break

    # sanity check, in case the node count or K-factor is less than required
    # K = 0, abort with error message
    if kfactor == 0:
        runner.abort(
            "Current cluster doesn't have duplicate partitions to perform online upgrade. K-factor: %d"
            % kfactor)

    # N = 1, abort with error message
    if fullClusterSize == 1:
        runner.abort(
            "Current cluster doesn't have enough nodes to perform online upgrade, at least two nodes are required"
        )

    response = checkstats.get_stats(runner, "DRROLE")
    clusterIds = []
    clusterIds.append(int(host.clusterid))  # add local cluster id first
    for tuple in response.table(0).tuples():
        remote_cluster_id = tuple[2]
        if remote_cluster_id != -1:
            clusterIds.append(
                remote_cluster_id)  # add remote cluster id if exists
    if len(clusterIds) == 127:
        runner.abort(
            "Failed to generate upgrade plan: number of connected cluster reaches the maximum limit (127)."
        )

    # Check the existence of voltdb root path and new kit on all the existing nodes
    response = runner.call_proc('@CheckUpgradePlanNT', [
        VOLT.FastSerializer.VOLTTYPE_STRING,
        VOLT.FastSerializer.VOLTTYPE_STRING
    ], [runner.opts.newKit, runner.opts.newRoot])
    error = False
    warnings = ""
    for tuple in response.table(0).tuples():
        hostId = tuple[0]
        result = tuple[1]
        warning = tuple[2]
        if result != 'Success':
            error = True
            host = hosts.hosts_by_id.get(hostId)
            if host is None:
                runner.abort('@CheckUpgradePlanNT returned a host id ' +
                             str(hostId) + " that doesn't belong to the cluster.")
            runner.error('Check failed on host ' + getHostnameOrIp(host) +
                         " with the cause: " + result)
        if warning is not None:
            host = hosts.hosts_by_id.get(hostId)
            if host is None:
                runner.abort('@CheckUpgradePlanNT returned a host id ' +
                             str(hostId) + " that doesn't belong to the cluster.")
            warnings += 'On host ' + getHostnameOrIp(
                host) + ': \n' + warning + '\n'

    if error:
        runner.abort("Failed to pass pre-upgrade check. Abort. ")
    if warnings != "":
        runner.warning(warnings[:-1])  # get rid of last '\n'

    print('[1/4] Passed new VoltDB kit version check.')
    print('[2/4] Passed new VoltDB root path existence check.')

    return hosts, kfactor, clusterIds
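
This example (like Example 12 below) relies on a getHostnameOrIp helper that is not shown; one plausible sketch, assuming host objects carry hostname and ipaddress attributes from the @SystemInformation OVERVIEW rows:

def getHostnameOrIp(host):
    # Hypothetical helper (not defined in the examples): prefer the hostname
    # reported by @SystemInformation, falling back to the host's IP address
    # attribute, then to a placeholder, if the hostname is missing.
    if getattr(host, 'hostname', None):
        return host.hostname
    return getattr(host, 'ipaddress', '(unknown host)')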
Example 10
def stop(runner):

    # Exec @SystemInformation to find out about the cluster.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    # Connect to an arbitrary host that isn't being stopped.
    defaultport = 3021
    min_hosts = 1
    max_hosts = 1
    target_host = utility.parse_hosts(runner.opts.target_host, min_hosts,
                                      max_hosts, defaultport)[0]
    (thost,
     chost) = hosts.get_target_and_connection_host(target_host.host,
                                                   target_host.port)
    if thost is None:
        runner.abort('Host not found in cluster: %s:%d' %
                     (target_host.host, target_host.port))
    if chost is None:
        runner.abort(
            'The entire cluster is being stopped, use "shutdown" instead.')

    if runner.opts.username:
        user_info = ', user: %s' % runner.opts.username
    else:
        user_info = ''
    runner.info('Connecting to %s:%d%s (%s) to issue "stop" command' %
                (chost.get_admininterface(), chost.adminport, user_info,
                 chost.hostname))
    runner.voltdb_connect(chost.get_admininterface(), chost.adminport,
                          runner.opts.username, runner.opts.password,
                          runner.opts.ssl_config, runner.opts.kerberos)

    if not runner.opts.forcing:
        stateMessage = 'The node shutdown process has stopped.'
        actionMessage = 'You may shutdown the node with the "voltadmin stop --force" command.'
        try:
            runner.info('Preparing to stop the node.')
            resp = runner.call_proc('@PrepareStopNode',
                                    [VOLT.FastSerializer.VOLTTYPE_INTEGER],
                                    [thost.id],
                                    check_status=False)
            if resp.status() != 1:
                runner.abort(
                    'The preparation for node shutdown failed with status: %s'
                    % resp.response.statusString)
            # monitor partition leader migration
            runner.info(
                'Completing partition leader migration away from host %d: %s' %
                (thost.id, thost.hostname))
            checkstats.check_partition_leaders_on_host(runner, thost.id)
            runner.info('All partition leaders have been migrated.')
            # monitor export master transfer, but don't fail on timeout: target may have been
            # disabled, preventing transfer. In that case it's ok to proceed with the stop
            try:
                runner.info(
                    'Completing export master transfer away from host %d: %s' %
                    (thost.id, thost.hostname))
                checkstats.check_export_mastership_on_host(runner, thost.id)
                runner.info('All export masters have been transferred')
            except StatisticsProcedureException as proex:
                if not proex.isTimeout:
                    raise
                runner.info(proex.message)
                runner.info(
                    'This may be caused by an export target either disabled or removed from the configuration. No action is required; the stop node process will proceed.'
                )
        except StatisticsProcedureException as proex:
            runner.info(stateMessage)
            runner.error(proex.message)
            if proex.isTimeout:
                runner.info(actionMessage)
            sys.exit(proex.exitCode)
        except (KeyboardInterrupt, SystemExit):
            runner.info(stateMessage)
            runner.abort(actionMessage)
    # Stop the requested host using exec @StopNode HOST_ID
    runner.info('Stopping host %d: %s:%s' %
                (thost.id, thost.hostname, thost.internalport))
    if not runner.opts.dryrun:
        response = runner.call_proc('@StopNode',
                                    [VOLT.FastSerializer.VOLTTYPE_INTEGER],
                                    [thost.id],
                                    check_status=False)
        print response
        if response.status() != 1:  # not SUCCESS
            sys.exit(1)
Example 11
def shutdown(runner):
    if runner.opts.forcing and runner.opts.save:
        runner.abort_with_help(
            'You cannot specify both --force and --save options.')
    if runner.opts.cancel and runner.opts.save:
        runner.abort_with_help(
            'You cannot specify both --cancel and --save options.')
    if runner.opts.cancel and runner.opts.forcing:
        runner.abort_with_help(
            'You cannot specify both --cancel and --force options.')
    if runner.opts.timeout <= 0:
        runner.abort_with_help(
            'The timeout value must be more than zero seconds.')
    shutdown_params = []
    columns = []
    zk_pause_txnid = 0

    if runner.opts.cancel:
        runner.info('Canceling cluster shutdown ...')
        response = runner.call_proc('@CancelShutdown', [], [])
        if response.status() != 1:
            runner.abort('Cancel shutdown failed with status: %s' %
                         response.response.statusString)
        else:
            runner.info('Shutdown canceled.')
    else:
        runner.info('Cluster shutdown in progress.')
        if not runner.opts.forcing:
            response = runner.call_proc('@SystemInformation',
                                        [VOLT.FastSerializer.VOLTTYPE_STRING],
                                        ['OVERVIEW'])

            # Convert @SystemInformation results to objects.
            hosts = Hosts(runner.abort)
            for tuple in response.table(0).tuples():
                hosts.update(*tuple)
            host = hosts.hosts_by_id.itervalues().next()
            if host.get('clustersafety') == "REDUCED":
                runner.info(
                    'Since the cluster is in reduced K-safety mode, taking a final snapshot before shutdown.'
                )
                runner.opts.save = True

            stateMessage = 'The cluster shutdown process has stopped. The cluster is still in a paused state.'
            actionMessage = 'You may shutdown the cluster with the "voltadmin shutdown --force" command, '\
                            + 'continue to wait with "voltadmin shutdown",\n'\
                            + 'or cancel the shutdown with the "voltadmin shutdown --cancel" command'
            try:
                runner.info('Preparing for shutdown...')
                resp = runner.call_proc('@PrepareShutdown', [], [])
                if resp.status() != 1:
                    runner.abort(
                        'The preparation for shutdown failed with status: %s' %
                        resp.response.statusString)
                zk_pause_txnid = resp.table(0).tuple(0).column_integer(0)
                runner.info('The cluster is paused prior to shutdown.')

                runner.info('Writing out all queued export data...')
                status = runner.call_proc(
                    '@Quiesce', [], []).table(0).tuple(0).column_integer(0)
                if status != 0:
                    runner.abort(
                        'The cluster failed to quiesce with status: %d'
                        % status)

                checkstats.check_clients(runner)
                checkstats.check_importer(runner)

                # Checking command log regardless of whether we're community or enterprise
                checkstats.check_command_log(runner)
                runner.info(
                    'If running Enterprise Edition, all transactions have been made durable.'
                )

                if runner.opts.save:
                    actionMessage = 'You may shutdown the cluster with the "voltadmin shutdown --force" command, '\
                             + 'continue to wait with "voltadmin shutdown --save",\n'\
                             + 'or cancel the shutdown with the "voltadmin shutdown --cancel" command.'
                    columns = [VOLT.FastSerializer.VOLTTYPE_BIGINT]
                    shutdown_params = [zk_pause_txnid]
                    # save option, check more stats
                    checkstats.check_dr_consumer(runner)
                    runner.info(
                        'Starting resolution of external commitments...')
                    checkstats.check_exporter(runner)
                    status = runner.call_proc(
                        '@Quiesce', [], []).table(0).tuple(0).column_integer(0)
                    if status != 0:
                        runner.abort(
                            'The cluster has failed to quiesce with status: %d'
                            % status)
                    checkstats.check_dr_producer(runner)
                    runner.info(
                        'Saving a final snapshot. The cluster will shut down after the snapshot is finished...'
                    )
                else:
                    checkstats.check_exporter(runner)
                    runner.info('Shutting down the cluster...')
            except StatisticsProcedureException as proex:
                runner.info(stateMessage)
                runner.error(proex.message)
                if proex.isTimeout:
                    runner.info(actionMessage)
                sys.exit(proex.exitCode)
            except (KeyboardInterrupt, SystemExit):
                runner.info(stateMessage)
                runner.abort(actionMessage)
        response = runner.call_proc('@Shutdown',
                                    columns,
                                    shutdown_params,
                                    check_status=False)
        print response
Example 12
def basicCheck(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    # get current version and root directory from an arbitrary node
    host = hosts.hosts_by_id.itervalues().next()
    fullClusterSize = int(host.fullclustersize)
    if len(hosts.hosts_by_id) < fullClusterSize:
        runner.abort(
            "Current cluster needs %d more node(s) to achieve full K-safety. In-service upgrade is not recommended in partial K-safety cluster."
            % (fullClusterSize - len(hosts.hosts_by_id)))

    if fullClusterSize % 2 == 1 and runner.opts.newNode is None:
        runner.abort(
            "The cluster has odd number of nodes, plan_upgrade needs an extra node to generate the instructions"
        )

    host = hosts.hosts_by_id.itervalues().next()
    currentVersion = host.version

    # get k-factor from @SystemInformation
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'])
    for tuple in response.table(0).tuples():
        if tuple[0] == 'kfactor':
            kfactor = tuple[1]
            break

    # sanity check, in case the node count or K-factor is less than required
    # K = 0, abort with error message
    if kfactor == 0:
        runner.abort(
            "Current cluster doesn't have duplicate partitions to perform online upgrade. K-factor: %d"
            % kfactor)

    # N = 1, abort with error message
    if fullClusterSize == 1:
        runner.abort(
            "Current cluster doesn't have enough node to perform online upgrade, at least two nodes are required"
        )

    response = checkstats.get_stats(runner, "DRROLE")
    largestClusterId = -1
    for tuple in response.table(0).tuples():
        remote_cluster_id = tuple[2]
        if remote_cluster_id > largestClusterId:
            largestClusterId = remote_cluster_id

    # Check the existence of voltdb root path and new kit on all the existing nodes
    response = runner.call_proc('@CheckUpgradePlanNT', [
        VOLT.FastSerializer.VOLTTYPE_STRING,
        VOLT.FastSerializer.VOLTTYPE_STRING
    ], [runner.opts.newKit, runner.opts.newRoot])
    error = False
    for tuple in response.table(0).tuples():
        hostId = tuple[0]
        result = tuple[1]
        if result != 'Success':
            error = True
            host = hosts.hosts_by_id.get(hostId)
            if host is None:
                runner.abort('@CheckUpgradePlanNT returned a host id ' +
                             str(hostId) + " that doesn't belong to the cluster.")
            print 'Pre-upgrade check failed on host ' + getHostnameOrIp(
                host) + " with the cause: " + result

    if error:
        runner.abort("Failed to pass pre-upgrade check. Abort. ")
    print '[1/4] Passed new VoltDB kit version check.'
    print '[2/4] Passed new VoltDB root path existence check.'

    return hosts, kfactor, largestClusterId
Example 13
def getClusterInfo(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for tuple in response.table(0).tuples():
        hosts.update(tuple[0], tuple[1], tuple[2])

    # get current version and root directory from an arbitrary node
    host = hosts.hosts_by_id.itervalues().next()
    clusterId = host.clusterid
    fullClusterSize = int(host.fullclustersize)
    version = host.version
    uptime = host.uptime

    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'])
    for tuple in response.table(0).tuples():
        if tuple[0] == 'kfactor':
            kfactor = tuple[1]
            break

    cluster = Cluster(int(clusterId), version, int(kfactor),
                      int(fullClusterSize), uptime)
    for hostId, hostInfo in hosts.hosts_by_id.items():
        cluster.add_member(hostId, hostInfo.hostname)

    # number of live clients connected to the cluster
    response = checkstats.get_stats(runner, "LIVECLIENTS")
    liveclients = 0
    for tuple in response.table(0).tuples():
        isAdmin = tuple[5]
        # exclude admin connections
        if isAdmin != 1:
            liveclients += 1
    cluster.update_live_clients(liveclients)

    if runner.opts.dr:
        # Do we have any ongoing DR conversation?
        response = checkstats.get_stats(runner, "DRROLE")
        for tuple in response.table(0).tuples():
            role = tuple[0]
            status = tuple[1]
            remote_cluster_id = tuple[2]
            if (remote_cluster_id != -1):
                cluster.add_remote_cluster(remote_cluster_id, status, role)

        response = checkstats.get_stats(runner, "DRPRODUCER")
        for tuple in response.table(0).tuples():
            host_name = tuple[2]
            remote_cluster_id = tuple[4]
            last_queued_drid = tuple[10]
            last_queued_ts = tuple[12]
            last_acked_ts = tuple[13]
            if last_queued_drid == -1:
                delay = 0
            else:
                delay = (last_queued_ts - last_acked_ts).total_seconds()
            cluster.get_remote_cluster(
                remote_cluster_id).update_producer_latency(
                    host_name, remote_cluster_id, delay)

        # Find remote topology through drconsumer stats
        response = checkstats.get_stats(runner, "DRCONSUMER")
        for tuple in response.table(1).tuples():
            remote_cluster_id = tuple[4]
            covering_host = tuple[7]
            last_applied_ts = tuple[9]
            if covering_host != '':
                cluster.get_remote_cluster(
                    remote_cluster_id).add_remote_member(covering_host)

    return cluster