def GetLuxiClient(try_restart):
  """Tries to connect to the luxi daemon.

  @type try_restart: bool
  @param try_restart: Whether to attempt to restart the master daemon

  """
  try:
    return cli.GetClient()
  except errors.OpPrereqError as err:
    # this is, from cli.GetClient, a not-master case
    raise NotMasterError("Not on master node (%s)" % err)
  except (rpcerr.NoMasterError, rpcerr.TimeoutError) as err:
    if not try_restart:
      raise

    logging.warning("Luxi daemon seems to be down (%s), trying to restart",
                    err)

    if not utils.EnsureDaemon(constants.LUXID):
      raise errors.GenericError("Can't start the master daemon")

    # Retry the connection
    return cli.GetClient()
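# A minimal caller sketch (not part of the original source): it shows how a
# watcher-style script might obtain a LUXI client through GetLuxiClient() and
# degrade gracefully when it is not running on the master node.  The function
# name _ExampleRun is hypothetical; the query at the end reuses
# cl.QueryGroups() exactly as it appears elsewhere in these snippets.
def _ExampleRun():
  try:
    cl = GetLuxiClient(try_restart=True)
  except NotMasterError:
    # Not the master node; nothing to do on this host
    return
  except errors.GenericError as err:
    logging.error("Cannot reach the master daemon: %s", err)
    return

  # With a working client, regular LUXI queries can be issued
  for (name, uuid) in cl.QueryGroups([], ["name", "uuid"], False):
    logging.info("Found node group %s (%s)", name, uuid)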
def _TestJobSubmission(opts):
  """Tests submitting jobs.

  """
  ToStdout("Testing job submission")

  testdata = [
    (0, 0, constants.OP_PRIO_LOWEST),
    (0, 0, constants.OP_PRIO_HIGHEST),
    ]

  for priority in (constants.OP_PRIO_SUBMIT_VALID |
                   frozenset([constants.OP_PRIO_LOWEST,
                              constants.OP_PRIO_HIGHEST])):
    for offset in [-1, +1]:
      testdata.extend([
        (0, 0, priority + offset),
        (3, 0, priority + offset),
        (0, 3, priority + offset),
        (4, 2, priority + offset),
        ])

  for before, after, failpriority in testdata:
    ops = []
    ops.extend([opcodes.OpTestDelay(duration=0) for _ in range(before)])
    ops.append(opcodes.OpTestDelay(duration=0, priority=failpriority))
    ops.extend([opcodes.OpTestDelay(duration=0) for _ in range(after)])

    try:
      cl = cli.GetClient()
      cl.SubmitJob(ops)
    except errors.GenericError as err:
      if opts.debug:
        ToStdout("Ignoring error for 'wrong priority' test: %s", err)
    else:
      raise errors.OpExecError("Submitting opcode with priority %s did not"
                               " fail when it should (allowed are %s)" %
                               (failpriority, constants.OP_PRIO_SUBMIT_VALID))

  jobs = [
    [opcodes.OpTestDelay(duration=0),
     opcodes.OpTestDelay(duration=0, dry_run=False),
     opcodes.OpTestDelay(duration=0, dry_run=True)],
    ops,
    ]

  try:
    cl = cli.GetClient()
    cl.SubmitManyJobs(jobs)
  except errors.GenericError as err:
    if opts.debug:
      ToStdout("Ignoring error for 'wrong priority' test: %s", err)
  else:
    raise errors.OpExecError("Submitting manyjobs with an incorrect one"
                             " did not fail when it should.")

  ToStdout("Job submission tests were successful")
def get_instance_nics(instance, logger):
    """Query Ganeti to get the instance's NICs.

    Get the instance's NICs from Ganeti configuration data. If running on
    master, query Ganeti via the Ganeti CLI client. Otherwise, get the NICs
    from the Ganeti configuration file.

    @type instance: string
    @param instance: the name of the instance
    @rtype: list of dicts
    @return: list of the instance's NICs. Each dictionary contains the
             following keys: 'network', 'ip', 'mac', 'mode', 'link' and
             'firewall'

    """
    try:
        client = cli.GetClient()
        fields = ["nic.networks", "nic.ips", "nic.macs", "nic.modes",
                  "nic.links", "tags"]
        info = client.QueryInstances([instance], fields, use_locking=False)
        networks, ips, macs, modes, links, tags = info[0]
        nic_keys = ["network", "ip", "mac", "mode", "link"]
        nics = zip(networks, ips, macs, modes, links)
        nics = map(lambda x: dict(zip(nic_keys, x)), nics)
    except ganeti_errors.OpPrereqError:
        # Not running on master! Load the conf file
        raw_data = utils.ReadFile(constants.CLUSTER_CONF_FILE)
        config = serializer.LoadJson(raw_data)
        i = config["instances"][instance]
        nics = []
        for nic in i["nics"]:
            params = nic.pop("nicparams")
            nic["mode"] = params["mode"]
            nic["link"] = params["link"]
            nics.append(nic)
        tags = i.get("tags", [])

    # Get firewall from instance tags
    # Tags are of the form synnefo:network:N:firewall_mode
    for tag in tags:
        t = tag.split(":")
        if t[0:2] == ["synnefo", "network"]:
            if len(t) != 4:
                logger.error("Malformed synnefo tag %s", tag)
                continue
            try:
                index = int(t[2])
                nics[index]['firewall'] = t[3]
            except ValueError:
                logger.error("Malformed synnefo tag %s", tag)
            except IndexError:
                logger.error("Found tag %s for non-existent NIC %d",
                             tag, index)

    return nics
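# Illustration only (all values are made up): the tag handling in
# get_instance_nics() above attaches a firewall mode to NIC N for tags of the
# form "synnefo:network:N:firewall_mode".  The firewall mode string below is a
# hypothetical example value.
nics = [{"network": "net0", "ip": "10.0.0.2", "mac": "aa:00:00:00:00:01",
         "mode": "bridged", "link": "br0"}]
tag = "synnefo:network:0:protected"   # hypothetical firewall mode

t = tag.split(":")
if t[0:2] == ["synnefo", "network"] and len(t) == 4:
    # NIC index comes from the third field, firewall mode from the fourth
    nics[int(t[2])]["firewall"] = t[3]

assert nics[0]["firewall"] == "protected"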
def _TestJobDependency(opts):
  """Tests job dependencies.

  """
  ToStdout("Testing job dependencies")

  try:
    cl = cli.GetClient()
    SubmitOpCode(opcodes.OpTestDelay(duration=0, depends=[(-1, None)]), cl=cl)
  except errors.GenericError, err:
    if opts.debug:
      ToStdout("Ignoring error for 'wrong dependencies' test: %s", err)
def GetLuxiClient(try_restart, query=False):
  """Tries to connect to the master daemon.

  @type try_restart: bool
  @param try_restart: Whether to attempt to restart the master daemon

  """
  try:
    return cli.GetClient(query=query)
  except errors.OpPrereqError, err:
    # this is, from cli.GetClient, a not-master case
    raise NotMasterError("Not on master node (%s)" % err)
def __init__(self):
  """Constructor."""
  self.url_opener = SimpleOpener()
  self._feed_buf = StringIO()
  self.nodes = []
  self.instances = []
  self.to_rem = []
  self.queued_ops = []
  self.opts = None
  self.queue_retry = False
  self.disk_count = self.disk_growth = self.disk_size = None
  self.hvp = self.bep = None
  self.ParseOptions()
  self.cl = cli.GetClient()
  self.GetState()
def GenericOpCodes(opts, args):
  """Send any opcode to the master.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: should contain only one element, the path of
      the file with the opcode definition
  @rtype: int
  @return: the desired exit code

  """
  cl = cli.GetClient()
  jex = cli.JobExecutor(cl=cl, verbose=opts.verbose, opts=opts)

  job_cnt = 0
  op_cnt = 0
  if opts.timing_stats:
    ToStdout("Loading...")
  for job_idx in range(opts.rep_job):
    for fname in args:
      # pylint: disable=W0142
      op_data = simplejson.loads(utils.ReadFile(fname))
      op_list = [opcodes.OpCode.LoadOpCode(val) for val in op_data]
      op_list = op_list * opts.rep_op
      jex.QueueJob("file %s/%d" % (fname, job_idx), *op_list)
      op_cnt += len(op_list)
      job_cnt += 1

  if opts.timing_stats:
    t1 = time.time()
    ToStdout("Submitting...")

  jex.SubmitPending(each=opts.each)

  if opts.timing_stats:
    t2 = time.time()
    ToStdout("Executing...")

  jex.GetResults()
  if opts.timing_stats:
    t3 = time.time()
    ToStdout("C:op     %4d" % op_cnt)
    ToStdout("C:job    %4d" % job_cnt)
    ToStdout("T:submit %4.4f" % (t2 - t1))
    ToStdout("T:exec   %4.4f" % (t3 - t2))
    ToStdout("T:total  %4.4f" % (t3 - t1))
  return 0
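# Sketch (an assumption, not taken from the original source) of an input file
# that GenericOpCodes() could consume: a JSON list whose entries are serialized
# opcodes acceptable to opcodes.OpCode.LoadOpCode().  The exact field layout is
# assumed here; the example entry mimics the OpTestDelay(duration=0) opcodes
# used throughout these tests.
import json

example_ops = [
  {"OP_ID": "OP_TEST_DELAY", "duration": 0},   # hypothetical serialized opcode
]

with open("delay-job.json", "w") as fh:
  json.dump(example_ops, fh)

# The resulting file would then be passed as the single element of `args`, and
# repeated via opts.rep_job / opts.rep_op as implemented above.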
def _ProcessTestMessage(self, job_id, sockname, test, arg):
  """Handles a job queue test message.

  """
  if test not in constants.JQT_ALL:
    raise errors.OpExecError("Received invalid test message %s" % test)

  sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
  try:
    sock.settimeout(30.0)

    logging.debug("Connecting to %s", sockname)
    sock.connect(sockname)

    logging.debug("Checking status")
    jobdetails = cli.GetClient().QueryJobs([job_id], ["status"])[0]
    if not jobdetails:
      raise errors.OpExecError("Can't find job %s" % job_id)

    status = jobdetails[0]

    logging.debug("Status of job %s is %s", job_id, status)

    if test == constants.JQT_EXPANDNAMES:
      if status != constants.JOB_STATUS_WAITING:
        raise errors.OpExecError("Job status while expanding names is '%s',"
                                 " not '%s' as expected" %
                                 (status, constants.JOB_STATUS_WAITING))
    elif test in (constants.JQT_EXEC, constants.JQT_LOGMSG):
      if status != constants.JOB_STATUS_RUNNING:
        raise errors.OpExecError("Job status while executing opcode is '%s',"
                                 " not '%s' as expected" %
                                 (status, constants.JOB_STATUS_RUNNING))

    if test == constants.JQT_STARTMSG:
      logging.debug("Expecting %s test messages", arg)
      self._testmsgs = []
    elif test == constants.JQT_LOGMSG:
      if len(self._testmsgs) != arg:
        raise errors.OpExecError("Received %s test messages when %s are"
                                 " expected" % (len(self._testmsgs), arg))
  finally:
    logging.debug("Closing socket")
    sock.close()
def TestJobqueue(opts, _):
  """Runs a few tests on the job queue.

  """
  _TestJobSubmission(opts)
  _TestJobDependency(opts)

  (TM_SUCCESS,
   TM_MULTISUCCESS,
   TM_FAIL,
   TM_PARTFAIL) = range(4)
  TM_ALL = compat.UniqueFrozenset([
    TM_SUCCESS,
    TM_MULTISUCCESS,
    TM_FAIL,
    TM_PARTFAIL,
    ])

  for mode in TM_ALL:
    test_messages = [
      "Testing mode %s" % mode,
      "Hello World",
      "A",
      "",
      "B",
      "Foo|bar|baz",
      utils.TimestampForFilename(),
      ]

    fail = mode in (TM_FAIL, TM_PARTFAIL)

    if mode == TM_PARTFAIL:
      ToStdout("Testing partial job failure")
      ops = [
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=True),
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        ]
      expect_messages = 3 * [test_messages]
      expect_opstatus = [
        constants.OP_STATUS_SUCCESS,
        constants.OP_STATUS_SUCCESS,
        constants.OP_STATUS_ERROR,
        constants.OP_STATUS_ERROR,
        ]
      expect_resultlen = 2
    elif mode == TM_MULTISUCCESS:
      ToStdout("Testing multiple successful opcodes")
      ops = [
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        ]
      expect_messages = 2 * [test_messages]
      expect_opstatus = [
        constants.OP_STATUS_SUCCESS,
        constants.OP_STATUS_SUCCESS,
        ]
      expect_resultlen = 2
    else:
      if mode == TM_SUCCESS:
        ToStdout("Testing job success")
        expect_opstatus = [constants.OP_STATUS_SUCCESS]
      elif mode == TM_FAIL:
        ToStdout("Testing job failure")
        expect_opstatus = [constants.OP_STATUS_ERROR]
      else:
        raise errors.ProgrammerError("Unknown test mode %s" % mode)

      ops = [
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=fail),
        ]
      expect_messages = [test_messages]
      expect_resultlen = 1

    cl = cli.GetClient()
    cli.SetGenericOpcodeOpts(ops, opts)

    # Send job to master daemon
    job_id = cli.SendJob(ops, cl=cl)

    reporter = _JobQueueTestReporter()
    results = None

    try:
      results = cli.PollJob(job_id, cl=cl, reporter=reporter)
    except errors.OpExecError, err:
      if not fail:
        raise
      ToStdout("Ignoring error for 'job fail' test: %s", err)
    else:
      if fail:
        raise errors.OpExecError("Job didn't fail when it should")

    # Check length of result
    if fail:
      if results is not None:
        raise errors.OpExecError("Received result from failed job")
    elif len(results) != expect_resultlen:
      raise errors.OpExecError("Received %s results (%s), expected %s" %
                               (len(results), results, expect_resultlen))

    # Check received log messages
    all_messages = [i for j in expect_messages for i in j]
    if reporter.GetTestMessages() != all_messages:
      raise errors.OpExecError("Received test messages don't match input"
                               " (input %r, received %r)" %
                               (all_messages, reporter.GetTestMessages()))

    # Check final status
    reported_job_id = reporter.GetJobId()
    if reported_job_id != job_id:
      raise errors.OpExecError("Reported job ID %s doesn't match"
                               " submission job ID %s" %
                               (reported_job_id, job_id))

    jobdetails = cli.GetClient().QueryJobs([job_id],
                                           ["status", "opstatus"])[0]
    if not jobdetails:
      raise errors.OpExecError("Can't find job %s" % job_id)

    if fail:
      exp_status = constants.JOB_STATUS_ERROR
    else:
      exp_status = constants.JOB_STATUS_SUCCESS

    (final_status, final_opstatus) = jobdetails
    if final_status != exp_status:
      raise errors.OpExecError("Final job status is %s, not %s as expected" %
                               (final_status, exp_status))
    if len(final_opstatus) != len(ops):
      raise errors.OpExecError("Did not receive status for all opcodes"
                               " (got %s, expected %s)" %
                               (len(final_opstatus), len(ops)))
    if final_opstatus != expect_opstatus:
      raise errors.OpExecError("Opcode status is %s, expected %s" %
                               (final_opstatus, expect_opstatus))
                         depends=[(-1, [])])],
    [opcodes.OpTestDelay(duration=1,
                         depends=[(-2, [constants.JOB_STATUS_SUCCESS])])],
    [opcodes.OpTestDelay(duration=1,
                         depends=[])],
    [opcodes.OpTestDelay(duration=1,
                         depends=[(-2, [constants.JOB_STATUS_SUCCESS])])],
    ]

  # Function for checking result
  check_fn = ht.TListOf(ht.TAnd(ht.TIsLength(2),
                                ht.TItems([ht.TBool,
                                           ht.TOr(ht.TNonEmptyString,
                                                  ht.TJobId)])))

  cl = cli.GetClient()

  result = cl.SubmitManyJobs(jobs)
  if not check_fn(result):
    raise errors.OpExecError("Job submission doesn't match %s: %s" %
                             (check_fn, result))

  # Wait for jobs to finish
  jex = JobExecutor(cl=cl, opts=opts)

  for (status, job_id) in result:
    jex.AddJobId(None, status, job_id)

  job_results = jex.GetResults()
  if not compat.all(row[0] for row in job_results):
    raise errors.OpExecError("At least one of the submitted jobs failed: %s" %
                             job_results)
def _TestJobDependency(opts):
  """Tests job dependencies.

  """
  ToStdout("Testing job dependencies")

  try:
    cl = cli.GetClient()
    SubmitOpCode(opcodes.OpTestDelay(duration=0, depends=[(-1, None)]), cl=cl)
  except errors.GenericError as err:
    if opts.debug:
      ToStdout("Ignoring error for 'wrong dependencies' test: %s", err)
  else:
    raise errors.OpExecError("Submitting plain opcode with relative job ID"
                             " did not fail as expected")

  # TODO: Test dependencies on errors
  jobs = [
    [opcodes.OpTestDelay(duration=1)],
    [opcodes.OpTestDelay(duration=1,
                         depends=[(-1, [])])],
    [opcodes.OpTestDelay(duration=1,
                         depends=[(-2, [constants.JOB_STATUS_SUCCESS])])],
    [opcodes.OpTestDelay(duration=1,
                         depends=[])],
    [opcodes.OpTestDelay(duration=1,
                         depends=[(-2, [constants.JOB_STATUS_SUCCESS])])],
    ]

  # Function for checking result
  check_fn = ht.TListOf(ht.TAnd(ht.TIsLength(2),
                                ht.TItems([ht.TBool,
                                           ht.TOr(ht.TNonEmptyString,
                                                  ht.TJobId)])))

  cl = cli.GetClient()

  result = cl.SubmitManyJobs(jobs)
  if not check_fn(result):
    raise errors.OpExecError("Job submission doesn't match %s: %s" %
                             (check_fn, result))

  # Wait for jobs to finish
  jex = JobExecutor(cl=cl, opts=opts)

  for (status, job_id) in result:
    jex.AddJobId(None, status, job_id)

  job_results = jex.GetResults()
  if not compat.all(row[0] for row in job_results):
    raise errors.OpExecError("At least one of the submitted jobs failed: %s" %
                             job_results)

  # Get details about jobs
  data = cl.QueryJobs([job_id for (_, job_id) in result],
                      ["id", "opexec", "ops"])
  data_job_id = [job_id for (job_id, _, _) in data]
  data_opexec = [opexec for (_, opexec, _) in data]
  data_op = [[opcodes.OpCode.LoadOpCode(op) for op in ops]
             for (_, _, ops) in data]

  assert compat.all(not op.depends or len(op.depends) == 1
                    for ops in data_op
                    for op in ops)

  # Check resolved job IDs in dependencies
  for (job_idx, res_jobdep) in [(1, data_job_id[0]),
                                (2, data_job_id[0]),
                                (4, data_job_id[2])]:
    if data_op[job_idx][0].depends[0][0] != res_jobdep:
      raise errors.OpExecError("Job %s's opcode doesn't depend on correct job"
                               " ID (%s)" % (job_idx, res_jobdep))

  # Check execution order
  if not (data_opexec[0] <= data_opexec[1] and
          data_opexec[0] <= data_opexec[2] and
          data_opexec[2] <= data_opexec[4]):
    raise errors.OpExecError("Jobs did not run in correct order: %s" % data)

  assert len(jobs) == 5 and compat.all(len(ops) == 1 for ops in jobs)

  ToStdout("Job dependency tests were successful")
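# Sketch (not part of the original source) of how the relative dependency IDs
# in the `jobs` list above are expected to resolve, mirroring the checks
# performed later in _TestJobDependency(): a negative value -N in `depends`
# refers to the N-th previous job submitted in the same SubmitManyJobs() call.
# The job IDs and the helper `resolve` are hypothetical.
submitted = ["job-A", "job-B", "job-C", "job-D", "job-E"]  # hypothetical IDs

def resolve(job_index, offset):
  # `offset` is negative and counted back from the job's own position
  return submitted[job_index + offset]

assert resolve(1, -1) == "job-A"   # jobs[1] depends on jobs[0]
assert resolve(2, -2) == "job-A"   # jobs[2] depends on jobs[0]
assert resolve(4, -2) == "job-C"   # jobs[4] depends on jobs[2]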
  except errors.OpPrereqError, err:
    # this is, from cli.GetClient, a not-master case
    raise NotMasterError("Not on master node (%s)" % err)
  except rpcerr.NoMasterError, err:
    if not try_restart:
      raise

    logging.warning("Master daemon seems to be down (%s), trying to restart",
                    err)

    if not utils.EnsureDaemon(constants.MASTERD):
      raise errors.GenericError("Can't start the master daemon")

    # Retry the connection
    return cli.GetClient(query=query)


def _StartGroupChildren(cl, wait):
  """Starts a new instance of the watcher for every node group.

  """
  assert not compat.any(arg.startswith(cli.NODEGROUP_OPT_NAME)
                        for arg in sys.argv)

  result = cl.QueryGroups([], ["name", "uuid"], False)

  children = []

  for (idx, (name, uuid)) in enumerate(result):
    args = sys.argv + [cli.NODEGROUP_OPT_NAME, uuid]
def get_instance_info(instance, logger):
    """Query Ganeti to get the instance's info (NICs, disks and hvparams).

    Get the instance's info from Ganeti configuration data. If running on
    master, query Ganeti via the Ganeti CLI client. Otherwise, get the info
    straight from Ganeti's configuration file.

    @type instance: string
    @param instance: the name of the instance
    @rtype: dict
    @return: Dictionary containing the 'nics', 'disks' and 'hvparams' of the
             instance.

    """
    try:
        client = cli.GetClient()
        q_fields = ["nic.names", "nic.networks.names", "nic.ips", "nic.macs",
                    "nic.modes", "nic.links", "nic.uuids", "tags",
                    "disk.names", "disk.sizes", "disk.uuids",
                    "hv/boot_order", "hv/cdrom_image_path"]
        info = client.QueryInstances([instance], q_fields, use_locking=False)
        # Parse NICs
        names, networks, ips, macs, modes, links, uuids, tags = info[0][:-5]
        nic_keys = ["name", "network", "ip", "mac", "mode", "link", "uuid"]
        nics = zip(names, networks, ips, macs, modes, links, uuids)
        nics = map(lambda x: dict(zip(nic_keys, x)), nics)
        # Parse Disks
        names, sizes, uuids = info[0][-5:-2]
        disk_keys = ["name", "size", "uuid"]
        disks = zip(names, sizes, uuids)
        disks = map(lambda x: dict(zip(disk_keys, x)), disks)
        hvparams = {'boot_order': info[0][-2],
                    'cdrom_image_path': info[0][-1]}
    except ganeti_errors.OpPrereqError:
        # Not running on master! Load the conf file
        raw_data = utils.ReadFile(pathutils.CLUSTER_CONF_FILE)
        config = serializer.LoadJson(raw_data)
        i = config["instances"][instance]
        # Parse NICs
        nics = []
        for index, nic in enumerate(i["nics"]):
            params = nic.pop("nicparams")
            nic["mode"] = params["mode"]
            nic["link"] = params["link"]
            nic["index"] = index
            nics.append(nic)
        # Parse Disks
        disks = []
        for index, disk in enumerate(i["disks"]):
            disks.append({"name": disk.pop("name"),
                          "size": disk["size"],
                          "uuid": disk["uuid"],
                          "index": index})
        tags = i.get("tags", [])
        hvparams = {"boot_order": i["hvparams"]["boot_order"],
                    "cdrom_image_path": i["hvparams"]["cdrom_image_path"]}

    # Get firewall from instance tags
    # Tags are of the form synnefo:network:N:firewall_mode
    for tag in tags:
        t = tag.split(":")
        if t[0:2] == ["synnefo", "network"]:
            if len(t) != 4:
                logger.error("Malformed synnefo tag %s", tag)
                continue
            nic_name = t[2]
            firewall = t[3]
            [nic.setdefault("firewall", firewall)
             for nic in nics if nic["name"] == nic_name]

    attachments = {"nics": nics,
                   "disks": disks,
                   "hvparams": hvparams}

    return attachments
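# Illustrative shape only (every value below is made up) of the dictionary
# returned by get_instance_info(), based purely on the keys assembled in the
# function above.
example_attachments = {
    "nics": [{"name": "eth0", "network": "net0", "ip": "10.0.0.2",
              "mac": "aa:00:00:00:00:01", "mode": "bridged", "link": "br0",
              "uuid": "nic-uuid-0", "firewall": "protected"}],
    "disks": [{"name": "disk0", "size": 10240, "uuid": "disk-uuid-0"}],
    "hvparams": {"boot_order": "disk", "cdrom_image_path": ""},
}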
  except errors.OpPrereqError, err:
    # this is, from cli.GetClient, a not-master case
    raise NotMasterError("Not on master node (%s)" % err)
  except rpcerr.NoMasterError, err:
    if not try_restart:
      raise

    logging.warning("Luxi daemon seems to be down (%s), trying to restart",
                    err)

    if not utils.EnsureDaemon(constants.LUXID):
      raise errors.GenericError("Can't start the master daemon")

    # Retry the connection
    return cli.GetClient()


def _StartGroupChildren(cl, wait):
  """Starts a new instance of the watcher for every node group.

  """
  assert not compat.any(arg.startswith(cli.NODEGROUP_OPT_NAME)
                        for arg in sys.argv)

  result = cl.QueryGroups([], ["name", "uuid"], False)

  children = []

  for (idx, (name, uuid)) in enumerate(result):
    args = sys.argv + [cli.NODEGROUP_OPT_NAME, uuid]