def TestNodeMigrate(node, node2):
    """gnt-node migrate

    Runs C{gnt-node migrate -f} first on C{node} and then on C{node2}.

    @param node: node whose primary instances are migrated first
    @param node2: the secondary node; it must not hold primary instances
    @raise qa_error.UnusableNodeError: if C{node2} has primary instances

    """
    primaries_on_secondary = qa_utils.GetNodeInstances(node2,
                                                       secondaries=False)
    if primaries_on_secondary:
        raise qa_error.UnusableNodeError(
            "Secondary node has at least one"
            " primary instance. This test requires"
            " it to have no primary instances.")

    # First pass migrates to the secondary node, second pass migrates back.
    for source in (node, node2):
        AssertCommand(["gnt-node", "migrate", "-f", source.primary])
def TestInstanceReinstall(instance):
    """gnt-instance reinstall

    Reinstalls the instance once normally, then checks that a reinstall with
    an unknown OS type is refused. Diskless instances are skipped.

    @param instance: the instance to reinstall

    """
    if instance.disk_template == constants.DT_DISKLESS:
        print(qa_utils.FormatInfo("Test not supported for diskless instances"))
        return

    reinstall = ["gnt-instance", "reinstall", "-f"]

    AssertCommand(reinstall + [instance.name])

    # Test with non-existent OS definition
    AssertCommand(reinstall + ["--os-type=NonExistantOsForQa", instance.name],
                  fail=True)
def TestJobCancellation():
    """gnt-job cancel

    Submits a long delay job, then a short one, cancels the short one and
    verifies that it ends up with the canceled status.

    @raise qa_error.Error: if the job ID cannot be parsed from the command
        output, or if the job does not reach the canceled status

    """
    # The delay used for the first command should be large enough for the next
    # command and the cancellation command to complete before the first job is
    # done. The second delay should be small enough that not too much time is
    # spent waiting in the case of a failed cancel and a running command.
    first_delay = 10.0
    second_delay = 1.0

    AssertCommand(["gnt-debug", "delay", "--submit", str(first_delay)])

    master = qa_config.GetMasterNode()

    # Forcing tty usage does not work on buildbot, so force all output of this
    # command to be redirected to stdout
    submit_output = GetCommandOutput(
        master.primary, "gnt-debug delay --submit %s 2>&1" % second_delay)

    found_ids = re.findall("JobID: ([0-9]+)", submit_output)
    if len(found_ids) != 1:
        raise qa_error.Error(
            "Cannot parse gnt-debug delay output to find job id")
    job_id = found_ids[0]

    AssertCommand(["gnt-job", "cancel", job_id])

    # Now wait until the second job finishes, and expect the watch to fail due
    # to job cancellation
    AssertCommand(["gnt-job", "watch", job_id], fail=True)

    # Then check for job cancellation
    status = _GetJobStatus(job_id)

    # If the job is still being cancelled, wait until the status changes or we
    # hit a timeout
    if status == constants.JOB_STATUS_CANCELING:
        poll_status = functools.partial(_RetryingFetchJobStatus,
                                        constants.JOB_STATUS_CANCELING,
                                        job_id)
        try:
            status = retry.Retry(poll_status, 2.0, 2 * first_delay)
        except retry.RetryTimeout:
            # The job status remains the same
            pass

    if status != constants.JOB_STATUS_CANCELED:
        raise qa_error.Error("Job was not successfully cancelled, status "
                             "found: %s" % status)
def TestClusterMasterFailover():
    """gnt-cluster master-failover

    Fails the master role over to another acquired node and back again,
    running C{gnt-node list} on the respective node after each step. The
    acquired node is released even if a command fails.

    """
    master = qa_config.GetMasterNode()
    failovermaster = qa_config.AcquireNode(exclude=master)

    try:
        # First to the acquired node, then back to the original master node
        for target in (failovermaster, master):
            AssertCommand(["gnt-cluster", "master-failover"], node=target)
            AssertCommand(["gnt-node", "list"], node=target)
    finally:
        failovermaster.Release()
def TestInstanceFailover(instance):
    """gnt-instance failover

    Fails the instance over and back again; the instance check is run after
    the first failover. Skipped if the instance does not support failover.

    @param instance: the instance to fail over

    """
    if not IsFailoverSupported(instance):
        print(qa_utils.FormatInfo("Instance doesn't support failover, skipping"
                                  " test"))
        return

    failover = ["gnt-instance", "failover", "--force", instance.name]

    # failover ...
    AssertCommand(failover)
    qa_utils.RunInstanceCheck(instance, True)

    # ... and back
    AssertCommand(failover)
def TestAdHocReasonRateLimit():
    """Tests that ad-hoc rate limiting using --reason="rate-limit:n:..." works.

    Three delay jobs are submitted with the same C{rate-limit:2:hello}
    reason; the first two are expected to be running or waiting, the third
    is expected to stay queued. All submitted jobs are killed afterwards,
    even when one of the checks fails, so that following tests do not
    inherit the long-running delay jobs.

    """
    # Make sure our test is not constrained by "max-running-jobs"
    # (simply set it to the default).
    AssertCommand(["gnt-cluster", "modify", "--max-running-jobs=20"])
    AssertCommand(["gnt-cluster", "modify", "--max-tracked-jobs=25"])

    submit_cmd = [
        "gnt-debug", "delay", "--print-jobid", "--submit",
        "--reason=rate-limit:2:hello", "20",
    ]

    # Collected incrementally so that a failure half-way through still lets
    # us clean up the jobs that were already submitted.
    job_ids = []
    try:
        # Only the first 2 jobs must be scheduled.
        for _ in range(3):
            job_ids.append(int(stdout_of(list(submit_cmd))))
        jid1, jid2, jid3 = job_ids

        time.sleep(0.1)  # give the scheduler some time to notice

        AssertIn(GetJobStatus(jid1), ["running", "waiting"],
                 msg="Job should not be rate-limited")
        AssertIn(GetJobStatus(jid2), ["running", "waiting"],
                 msg="Job should not be rate-limited")
        AssertEqual(GetJobStatus(jid3), "queued",
                    msg="Job should be rate-limited")
    finally:
        # Clean up.
        KillWaitJobs(job_ids)
def _SetupTempOs(node, dirname, variant, valid):
    """Creates a temporary OS definition on the given node.

    @param node: node on which the OS directory is created
    @param dirname: directory of the OS definition; removed and re-created
    @param variant: content written to C{variants.list}
    @param valid: whether to also provide the C{create} script; without it
        the definition is reported as invalid

    """
    sq = utils.ShellQuoteArgs

    # Start from a clean directory and work inside it
    steps = [
        sq(["rm", "-rf", dirname]),
        sq(["mkdir", "-p", dirname]),
        sq(["cd", dirname]),
    ]

    # All scripts are plain symlinks to /bin/true
    script_names = ["export", "import", "rename", "verify"]
    if valid:
        script_names.append("create")
    for script in script_names:
        steps.append(sq(["ln", "-fs", "/bin/true", script]))

    steps.append(sq(["echo", str(constants.OS_API_V20)]) +
                 " >ganeti_api_version")
    steps.append(sq(["echo", variant]) + " >variants.list")
    steps.append(sq(["echo", "funny this is funny"]) + " >parameters.list")

    cmd = " && ".join(steps)

    kind = ("an invalid", "a valid")[int(valid)]
    print(qa_utils.FormatInfo("Setting up %s with %s OS definition" %
                              (node.primary, kind)))

    AssertCommand(cmd, node=node)
def ModifyGroupSshPort(ipt_rules, group, nodes, ssh_port):
    """Modifies the node group settings and sets up iptable rules.

    For each pair of nodes add two rules that affect SSH connections from one
    to the other one.
    The first one redirects port 22 to some unused port so that connecting
    through 22 fails. The second redirects port `ssh_port` to port 22.
    Together this results in master seeing the SSH daemons on the nodes on
    `ssh_port` instead of 22.

    @param ipt_rules: object providing C{RedirectPort}, used to add the rules
    @param group: name of the node group to modify
    @param nodes: nodes for which the redirections are installed
    @param ssh_port: the SSH port set in the group's node parameters

    """
    default_ssh_port = netutils.GetDaemonPort(constants.SSH)
    all_nodes = qa_config.get("nodes")

    ssh_port_param = "--node-parameters=ssh_port=" + str(ssh_port)
    AssertCommand(["gnt-group", "modify", ssh_port_param, group])

    for target in nodes:
        # Connections from the node to itself via localhost
        ipt_rules.RedirectPort(target.primary, "localhost",
                               default_ssh_port, 65535)
        ipt_rules.RedirectPort(target.primary, "localhost",
                               ssh_port, default_ssh_port)
        # Connections from every configured node to this node
        for source in all_nodes:
            ipt_rules.RedirectPort(source.primary, target.primary,
                                   default_ssh_port, 65535)
            ipt_rules.RedirectPort(source.primary, target.primary,
                                   ssh_port, default_ssh_port)
def _AssertDrainFile(node, **kwargs):
    """Checks for the queue drain file.

    @param node: node on which C{test -f} is run against the drain file path
    @param kwargs: passed through to L{AssertCommand} (e.g. C{fail})

    """
    drain_file = _NodeQueueDrainFile(node)
    AssertCommand(["test", "-f", drain_file], node=node, **kwargs)
def TestInstanceConsecutiveFailures(instance):
    """Test five consecutive instance failures.

    The instance is shut down and the watcher run six times in a row; the
    watcher is expected to restart the instance the first five times and to
    leave it down the sixth time.

    @param instance: the instance to test with
    @raise qa_error.Error: if the instance state after a watcher run is not
        the expected one

    """
    inst_name = qa_utils.ResolveInstanceName(instance.name)
    inst_was_running = bool(_InstanceRunning(inst_name))

    _ResetWatcherDaemon()

    expected_states = [True] * 5 + [False]
    for should_start in expected_states:
        _ShutdownInstance(inst_name)
        RunWatcherDaemon()
        time.sleep(5)

        is_running = bool(_InstanceRunning(inst_name))
        if is_running == should_start:
            continue

        if should_start:
            msg = "Instance not started when it should"
        else:
            msg = "Instance started when it shouldn't"
        raise qa_error.Error(msg)

    AssertCommand(["gnt-instance", "info", inst_name])

    # Restore the state the instance was in before the test
    if inst_was_running:
        _StartInstance(inst_name)
def TestExclStorSingleNode(node):
    """gnt-node add/modify cannot change the exclusive_storage flag.

    Every combination of action and flag value is expected to fail.

    @param node: the node to run the commands against

    """
    actions = ["add", "modify"]
    flag_values = (True, False, "default")

    for action in actions:
        for flag_value in flag_values:
            cmd = _BuildSetESCmd(action, flag_value, node.primary)
            AssertCommand(cmd, fail=True)
def TestIcmpPing():
    """ICMP ping each node.

    Builds one shell command that pings the primary address of every
    configured node and, if any node has a secondary address, all secondary
    addresses as well; the command is then run from every node.

    """
    nodes = qa_config.get("nodes")

    pingprimary = pingsecondary = "fping"
    if qa_config.get("primary_ip_version") == 6:
        pingprimary = "fping6"

    primary_ips = [node.primary for node in nodes]
    secondary_ips = [node.secondary for node in nodes if node.secondary]

    cmdall = utils.ShellQuoteArgs([pingprimary, "-e"] + primary_ips)

    # Only ping secondaries when there is at least one target. The command
    # list itself is never empty (it always holds the program name and "-e"),
    # so the decision has to be made on the collected addresses.
    if secondary_ips:
        seccmd = [pingsecondary, "-e"] + secondary_ips
        cmdall = "%s && %s" % (cmdall, utils.ShellQuoteArgs(seccmd))

    for node in nodes:
        AssertCommand(cmdall, node=node)
def NodeAdd(node, readd=False, group=None):
    """Adds (or re-adds) a node to the cluster using C{gnt-node add}.

    @param node: the node to add; its C{added} flag is checked and, for a
        fresh add, set via C{MarkAdded}
    @type readd: bool
    @param readd: whether to pass C{--readd}; requires the node to be marked
        as added already
    @param group: optional node group passed via C{--node-group}
    @raise qa_error.Error: if the node's C{added} state does not match the
        requested operation

    """
    if node.added and not readd:
        raise qa_error.Error("Node %s already in cluster" % node.primary)
    if readd and not node.added:
        raise qa_error.Error("Node %s not yet in cluster" % node.primary)

    cmd = ["gnt-node", "add", "--no-ssh-key-check"]
    if node.secondary:
        cmd.append("--secondary-ip=%s" % node.secondary)
    if readd:
        cmd.append("--readd")
    if group is not None:
        cmd.extend(["--node-group", group])
    if not qa_config.GetModifySshSetup():
        cmd.append("--no-node-setup")
    cmd.append(node.primary)

    AssertCommand(cmd)

    if not readd:
        node.MarkAdded()
    else:
        assert node.added
def AssertClusterVerify(fail=False, errors=None, warnings=None,
                        no_warnings=None):
    """Run cluster-verify and check the result, ignoring warnings by default.

    @type fail: bool
    @param fail: if cluster-verify is expected to fail instead of succeeding.
    @type errors: list of tuples
    @param errors: List of CV_XXX errors that are expected; if specified, all
        the errors listed must appear in cluster-verify output. A non-empty
        value implies C{fail=True}.
    @type warnings: list of tuples
    @param warnings: List of CV_XXX warnings that are expected to be raised;
        if specified, all the warnings listed must appear in cluster-verify
        output.
    @type no_warnings: list of tuples
    @param no_warnings: List of CV_XXX warnings that we expect NOT to be
        raised.

    """
    cvcmd = "gnt-cluster verify"
    mnode = qa_config.GetMasterNode()

    # Without any expectation on the output only the exit code matters
    if not (errors or warnings or no_warnings):
        AssertCommand(cvcmd, fail=fail, node=mnode)
        return

    cvout = GetCommandOutput(mnode.primary, cvcmd + " --error-codes",
                             fail=(fail or errors))
    print(cvout)
    (act_errs, act_warns) = _GetCVErrorCodes(cvout)

    if errors:
        _CheckVerifyErrors(act_errs, errors, "error")
    if warnings:
        _CheckVerifyErrors(act_warns, warnings, "warning")
    if no_warnings:
        _CheckVerifyNoWarnings(act_warns, no_warnings)
def TestLiveRepair():
    """Test that a live-repair diagnosis runs the repair command.

    A diagnose command reporting the C{live-repair} status and a repair
    command that writes a random number to a file in C{/tmp} are uploaded to
    a non-master node. The test then checks that the file holds that number.

    The acquired node is released even when one of the checks fails.

    @raise qa_error.Error: if the file written by the repair command does not
        contain the expected number

    """
    _SetUp('live-repair')
    n = random.randint(10000, 99999)
    node = qa_config.AcquireNode(exclude=qa_config.GetMasterNode())
    try:
        diagnosis = serializer.DumpJson({
            "status": "live-repair",
            "command": "repair",
            "details": str(n)
        }).strip()
        # 0o755 is accepted by Python 2.6+ and 3, unlike the legacy 0755 form
        UploadData(node.primary,
                   'echo \'' + diagnosis + '\'',
                   0o755,
                   '/etc/ganeti/node-diagnose-commands/live-repair')
        # The script below is uploaded verbatim, hence it starts at column 0
        UploadData(node.primary,
                   """#!/usr/bin/python
import sys
import json

n = json.loads(sys.stdin.read())['details']
with open('/tmp/' + n, 'w') as f:
  f.write(n)
print 'file written'
""",
                   0o755,
                   '/etc/ganeti/node-repair-commands/repair')
        _AssertRepairCommand()
        tag = _AssertRepairTagAddition(node)
        # Element 1 of AssertCommand's result is compared with the number;
        # presumably this is the command's stdout -- confirm against
        # AssertCommand.
        if str(n) != AssertCommand(["cat", "/tmp/" + str(n)], node=node)[1]:
            raise qa_error.Error('Repair command was unsuccessful')
    finally:
        node.Release()
    _TearDown(node, tag,
              ['/etc/ganeti/node-diagnose-commands/live-repair',
               '/etc/ganeti/node-repair-commands/repair'],
              False)
def AppendRule(self, node, chain, rule, table="filter"):
    """Appends an `iptables` rule to a given node

    The rule is tagged with this object's marker as an iptables comment and
    the node is registered via C{AddNode}.

    @param node: node on which C{iptables} is run
    @param chain: chain the rule is appended to
    @type rule: list
    @param rule: the rule's arguments
    @param table: the iptables table to operate on

    """
    append_args = ["iptables", "-t", table, "-A", chain]
    comment_args = ["-m", "comment", "--comment", self.marker]
    AssertCommand(append_args + rule + comment_args, node=node)
    self.AddNode(node)
def TestGanetiCommands():
    """Test availability of Ganeti commands.

    Runs every listed program with C{--version} on each configured node.

    """
    programs = (
        "gnt-backup",
        "gnt-cluster",
        "gnt-debug",
        "gnt-instance",
        "gnt-job",
        "gnt-network",
        "gnt-node",
        "gnt-os",
        "gnt-storage",
        "gnt-filter",
        "ganeti-noded",
        "ganeti-rapi",
        "ganeti-watcher",
        "ganeti-confd",
        "ganeti-luxid",
        "ganeti-wconfd",
    )

    # One chained shell command, so the first missing program fails the test
    version_checks = [utils.ShellQuoteArgs([program, "--version"])
                      for program in programs]
    cmd = " && ".join(version_checks)

    for node in qa_config.get("nodes"):
        AssertCommand(cmd, node=node)
def TestFilterWatermark():
    """Tests that the filter watermark is set correctly

    A job is run before and after adding a filter; the watermark shown by
    C{gnt-filter info} must lie between the two job IDs. The filter is
    deleted again even if parsing or the check fails.

    """
    # Check what the current highest job ID is
    highest_jid1 = int(
        stdout_of(["gnt-debug", "delay", "--print-jobid", "0.01"]))

    # Add the filter; this sets the watermark
    uuid = stdout_of(["gnt-filter", "add"])

    try:
        # Check what the current highest job ID is
        highest_jid2 = int(
            stdout_of(["gnt-debug", "delay", "--print-jobid", "0.01"]))

        info_out = stdout_of(["gnt-filter", "info", uuid])
        # The second line of gnt-filter info shows the watermark.
        watermark_line = info_out.split('\n')[1].strip().lower()
        watermark = int(watermark_line.split("watermark: ")[1])

        # The watermark must be at least as high as the JID of the job we
        # started just before the creation, and must be lower than the JID of
        # any job created afterwards.
        assert highest_jid1 <= watermark < highest_jid2, \
            "Watermark not in range: %d <= %d < %d" % (highest_jid1,
                                                       watermark,
                                                       highest_jid2)
    finally:
        # Clean up.
        AssertCommand(["gnt-filter", "delete", uuid])
def _TestInstanceUserDownKvm(instance, master):
    """Runs the user-down test for a KVM instance.

    Enables the C{user_shutdown} hypervisor parameter, reboots the instance
    and delegates to L{_TestInstanceUserDown}, passing a callback that kills
    the instance's KVM process.

    @param instance: the instance under test
    @param master: passed through to L{_TestInstanceUserDown}

    """
    def _StopKVMInstance():
        # 'primary' is bound further down, before this callback is handed out
        kill_cmd = 'pkill -f "kvm -name %s"' % instance.name
        AssertCommand(kill_cmd, node=primary)
        time.sleep(5)

    AssertCommand(["gnt-instance", "modify", "-H", "user_shutdown=true",
                   instance.name])

    # The instance needs to reboot not because the 'user_shutdown'
    # parameter was modified but because the KVM daemon need to be
    # started, given that the instance was first created with user
    # shutdown disabled.
    AssertCommand(["gnt-instance", "reboot", instance.name])

    primary = _GetInstanceField(instance.name, "pnode")

    _TestInstanceUserDown(instance, master, _StopKVMInstance)
def TestInstanceConvertDiskToPlain(instance, inodes):
    """gnt-instance modify -t

    Converts a DRBD instance to the plain template and back to DRBD, using
    the second entry of C{inodes} as the C{-n} node. Instances with any other
    disk template are skipped.

    @param instance: the instance to convert
    @param inodes: the instance's nodes; exactly two are expected

    """
    name = instance.name
    template = instance.disk_template

    if template != constants.DT_DRBD8:
        print(qa_utils.FormatInfo("Unsupported template %s, skipping conversion"
                                  " test" % template))
        return

    assert len(inodes) == 2

    modify = ["gnt-instance", "modify", "-t"]
    AssertCommand(modify + [constants.DT_PLAIN, name])
    AssertCommand(modify + [constants.DT_DRBD8, "-n", inodes[1].primary, name])
def TestInstanceExport(instance, node):
    """gnt-backup export -n ...

    @param instance: the instance to export
    @param node: the node the export is written to
    @return: the resolved instance name, or C{None} if the test was skipped

    """
    name = instance.name

    # Export does not work for file-based templates, thus we skip the test
    file_based = (constants.DT_FILE, constants.DT_SHARED_FILE)
    if instance.disk_template in file_based:
        return

    export_cmd = ["gnt-backup", "export", "-n", node.primary, name]
    AssertCommand(export_cmd)
    return qa_utils.ResolveInstanceName(name)
def TestBackupList(expnode):
    """gnt-backup list

    Lists the exports on the given node and runs the generic query test over
    all export fields.

    @param expnode: the node whose exports are listed

    """
    node_filter = "--node=%s" % expnode.primary
    AssertCommand(["gnt-backup", "list", node_filter])

    export_fields = query.EXPORT_FIELDS.keys()
    qa_utils.GenericQueryTest("gnt-backup", export_fields,
                              namefield=None, test_unknown=False)
def TestInstanceReboot(instance):
    """gnt-instance reboot

    Reboots the instance with each configured reboot type, then shuts it
    down, reboots it once more and checks that it is reported as running.

    @param instance: the instance to reboot

    """
    name = instance.name
    reboot_types = qa_config.get("options", {}).get("reboot-types",
                                                    constants.REBOOT_TYPES)

    for rtype in reboot_types:
        AssertCommand(["gnt-instance", "reboot", "--type=%s" % rtype, name])

    # Rebooting a stopped instance
    AssertCommand(["gnt-instance", "shutdown", name])
    qa_utils.RunInstanceCheck(instance, False)
    AssertCommand(["gnt-instance", "reboot", name])

    master = qa_config.GetMasterNode()
    status_cmd = utils.ShellQuoteArgs(
        ["gnt-instance", "list", "--no-headers", "-o", "status", name])
    status_output = qa_utils.GetCommandOutput(master.primary, status_cmd)
    AssertEqual(status_output.strip(), constants.INSTST_RUNNING)
def TestNodeModify(node):
    """gnt-node modify

    Toggles the master-candidate, drained and offline flags of the node and
    sets its secondary IP, adjusting the candidate pool size around the test.

    @param node: the node to modify

    """
    default_pool_size = 10
    nodes = qa_config.GetAllNodes()
    test_pool_size = len(nodes) - 1

    def _SetCandidatePoolSize(size):
        AssertCommand(["gnt-cluster", "modify",
                       "--candidate-pool-size=%s" % size])

    def _DemoteWithAutoPromote(node_name):
        AssertCommand(["gnt-node", "modify", "--master-candidate=no",
                       "--auto-promote", node_name])

    # Reduce the number of master candidates, because otherwise all
    # subsequent 'gnt-cluster verify' commands fail due to not enough
    # master candidates.
    _SetCandidatePoolSize(test_pool_size)

    # make sure enough master candidates will be available by disabling the
    # master candidate role first with --auto-promote
    _DemoteWithAutoPromote(node.primary)

    # now it's safe to force-remove the master candidate role
    for flag in ["master-candidate", "drained", "offline"]:
        for value in ["yes", "no"]:
            AssertCommand(["gnt-node", "modify", "--force",
                           "--%s=%s" % (flag, value), node.primary])
            AssertCommand(["gnt-cluster", "verify"])

    AssertCommand(["gnt-node", "modify", "--master-candidate=yes",
                   node.primary])

    # Test setting secondary IP address
    AssertCommand(["gnt-node", "modify",
                   "--secondary-ip=%s" % node.secondary, node.primary])

    AssertRedirectedCommand(["gnt-cluster", "verify"])
    _SetCandidatePoolSize(default_pool_size)

    # For test clusters with more nodes than the default pool size,
    # we now have too many master candidates. To readjust to the original
    # size, manually demote all nodes and rely on auto-promotion to adjust.
    if len(nodes) > default_pool_size:
        master = qa_config.GetMasterNode()
        for n in nodes:
            if n.primary != master.primary:
                _DemoteWithAutoPromote(n.primary)
def TestUpgrade():
    """Test gnt-cluster upgrade.

    This tests the 'gnt-cluster upgrade' command by flipping
    between the current and a different version of Ganeti.
    To also recover subtle points in the configuration up/down
    grades, instances are left over both upgrades.

    The test is skipped unless both versions are configured.

    """
    this_version = qa_config.get("dir-version")
    other_version = qa_config.get("other-dir-version")
    if this_version is None or other_version is None:
        print(qa_utils.FormatInfo("Test not run, as versions not specified"))
        return

    upgrade_instances = qa_config.get("upgrade-instances", [])

    # Creation function and node count for every enabled, supported template
    # that was requested for the upgrade test
    inst_creates = []
    for (test_name, templ, cf, n) in qa_instance.available_instance_tests:
        if (qa_config.TestEnabled(test_name) and
                qa_config.IsTemplateSupported(templ) and
                templ in upgrade_instances):
            inst_creates.append((cf, n))

    # First to the other version, then back to the current one; each pass
    # works on a freshly created set of instances.
    for target_version in (other_version, this_version):
        live_instances = []
        for (cf, n) in inst_creates:
            nodes = qa_config.AcquireManyNodes(n)
            live_instances.append(cf(nodes))

        AssertCommand(["gnt-cluster", "upgrade", "--to", target_version])
        AssertCommand(["gnt-cluster", "verify"])

        for instance in live_instances:
            qa_instance.TestInstanceRemove(instance)
            instance.Release()
def KillWaitJobs(job_ids):
    """Kills the lists of jobs, then watches them so that when this function
    returns we can be sure the jobs are all done.

    This should be called at the end of tests that started jobs with --submit
    so that following tests have an empty job queue.

    @type job_ids: list of int
    @param job_ids: the lists of job IDs to kill and wait for

    """
    cancel_prefix = ["gnt-job", "cancel", "--kill", "--yes-do-it"]
    watch_prefix = ["gnt-job", "watch"]

    # We use fail=None to ignore the exit code, since it can be non-zero
    # if the job is already terminated.
    for jid in job_ids:
        AssertCommand(cancel_prefix + [str(jid)], fail=None)

    # Watching only starts once every job has been sent the kill request
    for jid in job_ids:
        AssertCommand(watch_prefix + [str(jid)], fail=None)
def TestNodeEvacuate(node, node2):
    """gnt-node evacuate

    Evacuates C{node2} to a third, freshly acquired node and then evacuates
    that node back to C{node2}. The third node is always released.

    @param node: excluded when acquiring the evacuation node
    @param node2: the node that is evacuated first
    @raise qa_error.UnusableNodeError: if the acquired node already holds
        secondary instances

    """
    node3 = qa_config.AcquireNode(exclude=[node, node2])
    try:
        if qa_utils.GetNodeInstances(node3, secondaries=True):
            raise qa_error.UnusableNodeError(
                "Evacuation node has at least one"
                " secondary instance. This test requires"
                " it to have no secondary instances.")

        # Evacuate all secondary instances, and then back again
        for (evacuated, new_secondary) in ((node2, node3), (node3, node2)):
            AssertCommand(["gnt-node", "evacuate", "-f",
                           "--new-secondary=%s" % new_secondary.primary,
                           evacuated.primary])
    finally:
        node3.Release()
def TestFilterReject():
    """Tests that the REJECT filter does reject new jobs and that the
    "jobid" predicate works.

    The filter is deleted again even if the check fails, so that a failing
    run does not leave a job-rejecting filter behind for later tests.

    """
    # Add a filter that rejects all new jobs.
    uuid = stdout_of([
        "gnt-filter", "add",
        '--predicates=[["jobid", [">", "id", "watermark"]]]',
        "--action=REJECT",
    ])

    try:
        # Newly queued jobs must now fail.
        AssertCommand(["gnt-debug", "delay", "0.01"], fail=True)
    finally:
        # Clean up.
        AssertCommand(["gnt-filter", "delete", uuid])
def _ResetWatcherDaemon():
    """Removes the watcher daemon's state file.

    The group state file pattern is expanded with a C{*-*-*-*} glob and the
    matching files are removed through C{bash -c "rm -vf ..."}.

    """
    master = qa_config.GetMasterNode()
    state_file_glob = pathutils.WATCHER_GROUP_STATE_FILE % "*-*-*-*"
    path = qa_utils.MakeNodePath(master, state_file_glob)

    # Run through bash so that the glob gets expanded
    AssertCommand(["bash", "-c", "rm -vf %s" % path])
def TestInstanceRenameAndBack(rename_source, rename_target):
    """gnt-instance rename

    This must leave the instance with the original name, not the target
    name.

    @param rename_source: the current name of the instance
    @param rename_target: the name the instance is temporarily renamed to

    """
    CheckSsconfInstanceList(rename_source)

    # first do a rename to a different actual name, expecting it to fail
    qa_utils.AddToEtcHosts(["meeeeh-not-exists", rename_target])
    try:
        AssertCommand(["gnt-instance", "rename", rename_source, rename_target],
                      fail=True)
        CheckSsconfInstanceList(rename_source)
    finally:
        qa_utils.RemoveFromEtcHosts(["meeeeh-not-exists", rename_target])

    info = GetInstanceInfo(rename_source)

    # Check instance volume tags correctly updated. Note that this check is lvm
    # specific, so we skip it for non-lvm-based instances.
    # FIXME: This will need updating when instances will be able to have
    # different disks living on storage pools with heterogeneous storage types.
    # FIXME: This check should be put inside the disk/storage class themselves,
    # rather than explicitly called here.
    is_lvm = info["storage-type"] == constants.ST_LVM_VG
    if is_lvm:
        # In the lvm world we can check for tags on the logical volume
        tags_cmd = ("lvs -o tags --noheadings %s | grep " %
                    (" ".join(info["volumes"]), ))
    else:
        # Other storage types don't have tags, so we use an always failing
        # command, to make sure it never gets executed
        tags_cmd = "false"

    check_tags = rename_source != rename_target and is_lvm

    def _AssertVolumeTags(renamed):
        # While renamed, grepping for the source name must fail and grepping
        # for the target name must succeed; the opposite holds otherwise.
        for node in info["nodes"]:
            AssertCommand(tags_cmd + rename_source, node=node, fail=renamed)
            AssertCommand(tags_cmd + rename_target, node=node,
                          fail=not renamed)

    # and now rename instance to rename_target...
    AssertCommand(["gnt-instance", "rename", rename_source, rename_target])
    CheckSsconfInstanceList(rename_target)
    qa_utils.RunInstanceCheck(rename_source, False)
    qa_utils.RunInstanceCheck(rename_target, False)

    # NOTE: tags might not be the exactly as the instance name, due to
    # charset restrictions; hence the test might be flaky
    if check_tags:
        _AssertVolumeTags(True)

    # and back
    AssertCommand(["gnt-instance", "rename", rename_target, rename_source])
    CheckSsconfInstanceList(rename_source)
    qa_utils.RunInstanceCheck(rename_target, False)

    if check_tags:
        _AssertVolumeTags(False)