def generateDRResetCommand(runner, survivor, victim, clusterIds, files, step):
    command = ""
    if len(clusterIds) == 1:
        # Single-cluster topology: reset DR directly against the victim cluster.
        command = 'voltadmin dr reset --cluster=%s -H %s:%d --force\n' % (
            survivor.clusterid, getHostnameOrIp(victim), victim.adminport)
    else:
        remoteTopo = dict()
        # Find a remote covering host through DRCONSUMER stats, one per remote cluster.
        response = checkstats.get_stats(runner, "DRCONSUMER")
        for row in response.table(1).tuples():
            remote_cluster_id = row[4]
            covering_host = row[7]
            last_applied_ts = row[9]
            if covering_host != '':
                if remote_cluster_id not in remoteTopo:
                    remoteTopo[remote_cluster_id] = covering_host
        # Assume the remote clusters use the same admin port.
        for clusterId, covering_host in remoteTopo.items():
            command += 'voltadmin dr reset --cluster=%s -H %s:%d --force\n' % (
                survivor.clusterid, covering_host.split(":")[0], survivor.adminport)
        # Scan for the first unused cluster id (the same scan used in
        # createDeploymentForNewCluster); note clusterId is not used below.
        for clusterId in range(1, 127):
            if clusterId not in clusterIds:
                break
        command += 'voltadmin dr reset --cluster=%s -H %s:%d --force' % (
            survivor.clusterid, getHostnameOrIp(victim), victim.adminport)
    writeCommands(
        files[getKey(survivor)],
        'Step %d: run dr reset command to tell other clusters to stop generating binary logs for the origin cluster' % step,
        command)
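# --- Example (illustrative only) ---------------------------------------------
# A minimal sketch of the plan text the step above emits for a hypothetical
# multi-cluster topology. Host names, ports, and cluster ids are made up.
def _example_dr_reset_commands():
    survivor_cluster_id = 1
    survivor_admin_port = 21211
    # Remote cluster id -> covering host, as recovered from DRCONSUMER stats.
    remote_topology = {2: 'db-east-1:21211', 3: 'db-west-1:21211'}
    commands = ""
    for cluster_id, covering_host in remote_topology.items():
        # One reset per remote cluster, addressed to its covering host;
        # assumes every cluster listens on the same admin port.
        commands += 'voltadmin dr reset --cluster=%s -H %s:%d --force\n' % (
            survivor_cluster_id, covering_host.split(":")[0], survivor_admin_port)
    return commands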
def getRemoteClusterIds(runner):
    response = checkstats.get_stats(runner, "DRROLE")
    clusterIds = []
    for row in response.table(0).tuples():
        remote_cluster_id = row[2]
        if remote_cluster_id != -1:
            clusterIds.append(remote_cluster_id)
    return clusterIds
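# --- Example (illustrative only) ---------------------------------------------
# DRROLE rows carry (role, state, remote_cluster_id); a remote_cluster_id of -1
# marks a slot with no DR conversation. The rows below are hypothetical.
def _example_remote_cluster_ids():
    rows = [('XDCR', 'ACTIVE', 2), ('NONE', 'DISABLED', -1)]
    return [row[2] for row in rows if row[2] != -1]  # -> [2]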
def createDeploymentForNewCluster(runner, xmlString, clusterIds, drSource,
                                  new_cluster_deploy, post_upgrade_deploy):
    et = ElementTree.ElementTree(ElementTree.fromstring(xmlString))
    hasAbsPath = checkAbsPaths(runner, et.getroot(), new_cluster_deploy)
    # Prefer not to use 0 as the cluster id: pick the first unused id.
    for clusterId in range(1, 127):
        if clusterId not in clusterIds:
            break
    # The existence of the DR tag is checked when generating the deployment
    # file for the origin cluster, so it is safe to skip that check here.
    dr = et.getroot().find('./dr')
    dr.attrib['id'] = str(clusterId)
    connection = dr.find('./connection')
    if connection is None:
        connection = ElementTree.Element('connection')
        dr.append(connection)
    connection.attrib['enabled'] = 'true'
    if drSource is not None:
        # Stand-alone cluster: append to any existing source list.
        if 'source' in connection.attrib and connection.attrib['source'] != '':
            connection.attrib['source'] += ',' + drSource
        else:
            connection.attrib['source'] = drSource
    else:
        # Multi-cluster: connect to a random covering host, but don't change
        # the source if it already exists.
        if 'source' not in connection.attrib:
            response = checkstats.get_stats(runner, "DRCONSUMER")
            for row in response.table(1).tuples():
                covering_host = row[7]
                if covering_host != '':
                    # Assume the remote cluster uses the same DR port.
                    connection.attrib['source'] = covering_host + ":" + host.drport
                    break
            if 'source' not in connection.attrib:
                runner.abort("Failed to generate the deployment file, no remote host found")
    prettyprint(et.getroot())
    et.write(new_cluster_deploy)
    # In the single-cluster case, also create a deployment file with the DR
    # connection disabled, to be applied after the upgrade.
    if post_upgrade_deploy is not None:
        connection.attrib['enabled'] = 'false'
        connection.attrib['source'] = ''
        et.write(post_upgrade_deploy)
    return hasAbsPath
def createDeploymentForNewCluster(runner, xmlString, largestClusterId, drSource,
                                  new_cluster_deploy):
    et = ElementTree.ElementTree(ElementTree.fromstring(xmlString))
    clusterId = largestClusterId + 1
    # Cluster ids range from 0 to 127; wrap around if needed.
    if clusterId > 127:
        clusterId = 0
    # The existence of the DR tag is checked when generating the deployment
    # file for the origin cluster, so it is safe to skip that check here.
    dr = et.getroot().find('./dr')
    dr.attrib['id'] = str(clusterId)
    connection = dr.find('./connection')
    if connection is None:
        connection = ElementTree.Element('connection')
        dr.append(connection)
    connection.attrib['enabled'] = 'true'
    if drSource is not None:
        # Stand-alone cluster: append to any existing source list.
        if 'source' in connection.attrib:
            connection.attrib['source'] += ',' + drSource
        else:
            connection.attrib['source'] = drSource
    else:
        # Multi-cluster: connect to a random covering host, but don't change
        # the source if it already exists.
        if 'source' not in connection.attrib:
            response = checkstats.get_stats(runner, "DRCONSUMER")
            for row in response.table(1).tuples():
                covering_host = row[7]
                if covering_host != '':
                    # Assume the remote cluster uses the same DR port.
                    connection.attrib['source'] = covering_host + ":" + host.drport
                    break
            if 'source' not in connection.attrib:
                runner.abort("Failed to generate the deployment file, no remote host found")
    et.write(new_cluster_deploy)
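# --- Example (illustrative only) ---------------------------------------------
# A self-contained sketch of the deployment edit performed above, on a
# stripped-down hypothetical deployment document (real deployment files carry
# many more elements). Runs with only the standard library.
def _example_deployment_edit():
    from xml.etree import ElementTree as ET
    root = ET.fromstring('<deployment><dr id="1"/></deployment>')
    dr = root.find('./dr')
    dr.attrib['id'] = '2'  # the cluster id chosen for the new cluster
    connection = dr.find('./connection')
    if connection is None:
        connection = ET.Element('connection')
        dr.append(connection)
    connection.attrib['enabled'] = 'true'
    connection.attrib['source'] = 'db-east-1:5555'  # hypothetical host:drport
    return ET.tostring(root).decode()
    # -> '<deployment><dr id="2"><connection enabled="true" source="db-east-1:5555" /></dr></deployment>'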
def getClusterInfo(runner, available_hosts, clearHostCache):
    # Raises an exception when the connection fails.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'], True, None, True)
    if response.response.status != 1:
        return None
    if clearHostCache:
        available_hosts[:] = []

    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for row in response.table(0).tuples():
        hosts.update(row[0], row[1], row[2])
    for hostId, hostInfo in hosts.hosts_by_id.items():
        if hostInfo.hostname not in available_hosts:
            available_hosts.append(hostInfo.hostname + ":" + str(hostInfo.clientport))

    # Get the current version and root directory from an arbitrary node.
    host = next(iter(hosts.hosts_by_id.values()))
    # ClusterId was added to @SystemInformation in v7.2, so check the version
    # of the target cluster before relying on it.
    version = host.version
    versionStr = version.split('.')
    majorVersion = int(versionStr[0])
    minorVersion = int(versionStr[1])
    if majorVersion < RELEASE_MAJOR_VERSION or \
            (majorVersion == RELEASE_MAJOR_VERSION and minorVersion < RELEASE_MINOR_VERSION):
        runner.abort("Only v7.2 or higher versions of VoltDB support this command. "
                     "The target cluster is running v" + version + ".")
    clusterId = host.clusterid
    fullClusterSize = int(host.fullclustersize)
    uptime = host.uptime

    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'], True, None, True)
    for row in response.table(0).tuples():
        if row[0] == 'kfactor':
            kfactor = row[1]
            break
    cluster = Cluster(int(clusterId), version, int(kfactor), int(fullClusterSize), uptime)
    for hostId, hostInfo in hosts.hosts_by_id.items():
        cluster.add_member(hostId, hostInfo.hostname)

    # Count live clients connected to the cluster.
    try:
        response = checkstats.get_stats(runner, "LIVECLIENTS")
    except StatisticsProcedureException as e:
        runner.info(e.message)
        sys.exit(e.exitCode)
    liveclients = 0
    for row in response.table(0).tuples():
        isAdmin = row[5]
        # Exclude admin connections.
        if isAdmin != 1:
            liveclients += 1
    cluster.update_live_clients(liveclients)

    if runner.opts.dr:
        # Do we have any ongoing DR conversation?
        try:
            response = checkstats.get_stats(runner, "DRROLE")
        except StatisticsProcedureException as e:
            runner.info(e.message)
            sys.exit(e.exitCode)
        for row in response.table(0).tuples():
            role = row[0]
            status = row[1]
            remote_cluster_id = row[2]
            if remote_cluster_id != -1:
                cluster.add_remote_cluster(remote_cluster_id, status, role)

        try:
            response = checkstats.get_stats(runner, "DRPRODUCER")
        except StatisticsProcedureException as e:
            runner.info(e.message)
            sys.exit(e.exitCode)
        for row in response.table(0).tuples():
            host_name = row[2]
            remote_cluster_id = row[4]
            last_queued_drid = row[10]
            last_queued_ts = row[12]
            last_acked_ts = row[13]
            if last_queued_drid == -1:
                delay = 0
            else:
                delay = (last_queued_ts - last_acked_ts).total_seconds()
            cluster.get_remote_cluster(remote_cluster_id).update_producer_latency(
                host_name, remote_cluster_id, delay)

        # Find the remote topology through DRCONSUMER stats.
        try:
            response = checkstats.get_stats(runner, "DRCONSUMER")
        except StatisticsProcedureException as e:
            runner.info(e.message)
            sys.exit(e.exitCode)
        for row in response.table(1).tuples():
            remote_cluster_id = row[4]
            covering_host = row[7]
            last_applied_ts = row[9]
            if covering_host != '':
                cluster.get_remote_cluster(remote_cluster_id).add_remote_member(covering_host)
    return cluster
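# --- Example (illustrative only) ---------------------------------------------
# How the DR producer delay above is derived, with hypothetical datetimes
# standing in for the last-queued / last-acked timestamps from DRPRODUCER stats.
def _example_producer_delay():
    from datetime import datetime
    last_queued_drid = 42  # -1 would mean nothing has been queued yet
    last_queued_ts = datetime(2024, 1, 1, 12, 0, 5)
    last_acked_ts = datetime(2024, 1, 1, 12, 0, 2)
    if last_queued_drid == -1:
        return 0
    return (last_queued_ts - last_acked_ts).total_seconds()  # -> 3.0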
def basicCheck(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for row in response.table(0).tuples():
        hosts.update(row[0], row[1], row[2])

    # Get the current version and root directory from an arbitrary node.
    host = next(iter(hosts.hosts_by_id.values()))
    fullClusterSize = int(host.fullclustersize)
    if len(hosts.hosts_by_id) < fullClusterSize:
        delta = fullClusterSize - len(hosts.hosts_by_id)
        runner.abort(
            "Current cluster needs %d more node%s to achieve full K-safety. "
            "Online upgrade is not supported in a partial K-safety cluster."
            % (delta, "" if delta == 1 else "s"))
    if fullClusterSize % 2 == 1 and runner.opts.newNode is None:
        runner.abort("The cluster has an odd number of nodes; plan_upgrade needs an extra node to generate the instructions")
    if fullClusterSize % 2 == 0 and runner.opts.newNode is not None:
        runner.abort("For an even-numbered cluster, 2 parameters are expected (received 3).")

    if runner.opts.newNode is not None:
        result = checkNewNode(runner.opts.newNode)
        if result is not None:
            runner.abort("Failed to resolve host {0}:{1}.".format(runner.opts.newNode, result))

    currentVersion = host.version

    # Get the K-factor from @SystemInformation.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'])
    for row in response.table(0).tuples():
        if row[0] == 'kfactor':
            kfactor = int(row[1])  # deployment values come back as strings
            break

    # Sanity checks, in case the node count or K-factor is less than required.
    # K = 0: abort with an error message.
    if kfactor == 0:
        runner.abort("Current cluster doesn't have duplicate partitions to perform online upgrade. K-factor: %d" % kfactor)
    # N = 1: abort with an error message.
    if fullClusterSize == 1:
        runner.abort("Current cluster doesn't have enough nodes to perform online upgrade, at least two nodes are required")

    response = checkstats.get_stats(runner, "DRROLE")
    clusterIds = []
    clusterIds.append(int(host.clusterid))  # add the local cluster id first
    for row in response.table(0).tuples():
        remote_cluster_id = row[2]
        if remote_cluster_id != -1:
            clusterIds.append(remote_cluster_id)  # add remote cluster ids if any
    if len(clusterIds) == 127:
        runner.abort("Failed to generate upgrade plan: the number of connected clusters has reached the maximum limit (127).")

    # Check the existence of the voltdb root path and the new kit on all existing nodes.
    response = runner.call_proc('@CheckUpgradePlanNT',
                                [VOLT.FastSerializer.VOLTTYPE_STRING,
                                 VOLT.FastSerializer.VOLTTYPE_STRING],
                                [runner.opts.newKit, runner.opts.newRoot])
    error = False
    warnings = ""
    for row in response.table(0).tuples():
        hostId = row[0]
        result = row[1]
        warning = row[2]
        if result != 'Success':
            error = True
            host = hosts.hosts_by_id[hostId]
            if host is None:
                runner.abort('@CheckUpgradePlanNT returned a host id ' + str(hostId) +
                             " that doesn't belong to the cluster.")
            runner.error('Check failed on host ' + getHostnameOrIp(host) +
                         " with the cause: " + result)
        if warning is not None:
            host = hosts.hosts_by_id[hostId]
            if host is None:
                runner.abort('@CheckUpgradePlanNT returned a host id ' + str(hostId) +
                             " that doesn't belong to the cluster.")
            warnings += 'On host ' + getHostnameOrIp(host) + ':\n' + warning + '\n'
    if error:
        runner.abort("Failed to pass pre-upgrade check. Abort.")
    if warnings != "":
        runner.warning(warnings[:-1])  # strip the trailing '\n'

    print('[1/4] Passed new VoltDB kit version check.')
    print('[2/4] Passed new VoltDB root path existence check.')
    return hosts, kfactor, clusterIds
def show_snapshots(runner):
    response = checkstats.get_stats(runner, "SnapshotSummary")
    print(response.table(0).format_table(caption='Snapshot Summary'))
def basicCheck(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for row in response.table(0).tuples():
        hosts.update(row[0], row[1], row[2])

    # Get the current version and root directory from an arbitrary node.
    host = next(iter(hosts.hosts_by_id.values()))
    fullClusterSize = int(host.fullclustersize)
    if len(hosts.hosts_by_id) < fullClusterSize:
        runner.abort(
            "Current cluster needs %d more node(s) to achieve full K-safety. "
            "In-service upgrade is not recommended in a partial K-safety cluster."
            % (fullClusterSize - len(hosts.hosts_by_id)))
    if fullClusterSize % 2 == 1 and runner.opts.newNode is None:
        runner.abort("The cluster has an odd number of nodes; plan_upgrade needs an extra node to generate the instructions")

    currentVersion = host.version

    # Get the K-factor from @SystemInformation.
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'])
    for row in response.table(0).tuples():
        if row[0] == 'kfactor':
            kfactor = int(row[1])  # deployment values come back as strings
            break

    # Sanity checks, in case the node count or K-factor is less than required.
    # K = 0: abort with an error message.
    if kfactor == 0:
        runner.abort("Current cluster doesn't have duplicate partitions to perform online upgrade. K-factor: %d" % kfactor)
    # N = 1: abort with an error message.
    if fullClusterSize == 1:
        runner.abort("Current cluster doesn't have enough nodes to perform online upgrade, at least two nodes are required")

    response = checkstats.get_stats(runner, "DRROLE")
    largestClusterId = -1
    for row in response.table(0).tuples():
        remote_cluster_id = row[2]
        if remote_cluster_id > largestClusterId:
            largestClusterId = remote_cluster_id

    # Check the existence of the voltdb root path and the new kit on all existing nodes.
    response = runner.call_proc('@CheckUpgradePlanNT',
                                [VOLT.FastSerializer.VOLTTYPE_STRING,
                                 VOLT.FastSerializer.VOLTTYPE_STRING],
                                [runner.opts.newKit, runner.opts.newRoot])
    error = False
    for row in response.table(0).tuples():
        hostId = row[0]
        result = row[1]
        if result != 'Success':
            error = True
            host = hosts.hosts_by_id[hostId]
            if host is None:
                runner.abort('@CheckUpgradePlanNT returned a host id ' + str(hostId) +
                             " that doesn't belong to the cluster.")
            print('Pre-upgrade check failed on host ' + getHostnameOrIp(host) +
                  " with the cause: " + result)
    if error:
        runner.abort("Failed to pass pre-upgrade check. Abort.")

    print('[1/4] Passed new VoltDB kit version check.')
    print('[2/4] Passed new VoltDB root path existence check.')
    return hosts, kfactor, largestClusterId
def getClusterInfo(runner):
    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['OVERVIEW'])
    # Convert @SystemInformation results to objects.
    hosts = Hosts(runner.abort)
    for row in response.table(0).tuples():
        hosts.update(row[0], row[1], row[2])

    # Get the current version and root directory from an arbitrary node.
    host = next(iter(hosts.hosts_by_id.values()))
    clusterId = host.clusterid
    fullClusterSize = int(host.fullclustersize)
    version = host.version
    uptime = host.uptime

    response = runner.call_proc('@SystemInformation',
                                [VOLT.FastSerializer.VOLTTYPE_STRING],
                                ['DEPLOYMENT'])
    for row in response.table(0).tuples():
        if row[0] == 'kfactor':
            kfactor = row[1]
            break
    cluster = Cluster(int(clusterId), version, int(kfactor), int(fullClusterSize), uptime)
    for hostId, hostInfo in hosts.hosts_by_id.items():
        cluster.add_member(hostId, hostInfo.hostname)

    # Count live clients connected to the cluster.
    response = checkstats.get_stats(runner, "LIVECLIENTS")
    liveclients = 0
    for row in response.table(0).tuples():
        isAdmin = row[5]
        # Exclude admin connections.
        if isAdmin != 1:
            liveclients += 1
    cluster.update_live_clients(liveclients)

    if runner.opts.dr:
        # Do we have any ongoing DR conversation?
        response = checkstats.get_stats(runner, "DRROLE")
        for row in response.table(0).tuples():
            role = row[0]
            status = row[1]
            remote_cluster_id = row[2]
            if remote_cluster_id != -1:
                cluster.add_remote_cluster(remote_cluster_id, status, role)

        response = checkstats.get_stats(runner, "DRPRODUCER")
        for row in response.table(0).tuples():
            host_name = row[2]
            remote_cluster_id = row[4]
            last_queued_drid = row[10]
            last_queued_ts = row[12]
            last_acked_ts = row[13]
            if last_queued_drid == -1:
                delay = 0
            else:
                delay = (last_queued_ts - last_acked_ts).total_seconds()
            cluster.get_remote_cluster(remote_cluster_id).update_producer_latency(
                host_name, remote_cluster_id, delay)

        # Find the remote topology through DRCONSUMER stats.
        response = checkstats.get_stats(runner, "DRCONSUMER")
        for row in response.table(1).tuples():
            remote_cluster_id = row[4]
            covering_host = row[7]
            last_applied_ts = row[9]
            if covering_host != '':
                cluster.get_remote_cluster(remote_cluster_id).add_remote_member(covering_host)
    return cluster
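# --- Example (illustrative only) ---------------------------------------------
# How remote-cluster membership falls out of DRCONSUMER host rows above: every
# non-empty covering host is recorded as a member of its remote cluster. The
# (remote_cluster_id, covering_host) pairs below are hypothetical.
def _example_remote_topology():
    rows = [(2, 'db-east-1:5555'), (2, 'db-east-2:5555'), (3, '')]
    topology = {}
    for remote_cluster_id, covering_host in rows:
        if covering_host != '':
            topology.setdefault(remote_cluster_id, []).append(covering_host)
    return topology  # -> {2: ['db-east-1:5555', 'db-east-2:5555']}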