Ejemplo n.º 1
0
def GetLuxiClient(try_restart):
    """Return a client connected to the luxi daemon.

    If the daemon cannot be reached and C{try_restart} is set, one attempt
    is made to start it before connecting a second time.

    @type try_restart: bool
    @param try_restart: Whether to attempt to restart the master daemon
    @return: the object returned by C{cli.GetClient}
    @raise NotMasterError: if C{cli.GetClient} fails with
        C{errors.OpPrereqError}
    @raise errors.GenericError: if the daemon could not be started

    """
    try:
        client = cli.GetClient()
    except errors.OpPrereqError as exc:
        # cli.GetClient reports the "not on the master node" case this way
        not_master_msg = "Not on master node (%s)" % exc
        raise NotMasterError(not_master_msg)
    except (rpcerr.NoMasterError, rpcerr.TimeoutError) as exc:
        if not try_restart:
            raise

        logging.warning("Luxi daemon seems to be down (%s), trying to restart",
                        exc)

        daemon_running = utils.EnsureDaemon(constants.LUXID)
        if not daemon_running:
            raise errors.GenericError("Can't start the master daemon")

        # Second and last connection attempt
        client = cli.GetClient()

    return client
Ejemplo n.º 2
0
def _TestJobSubmission(opts):
    """Check that jobs carrying a rejected opcode priority cannot be submitted.

    For every tested priority a list of C{OpTestDelay} opcodes is built in
    which exactly one opcode carries that priority. The list is submitted
    once through C{SubmitJob} and once, together with a second job, through
    C{SubmitManyJobs}; both calls are expected to fail with
    C{errors.GenericError}.

    @param opts: command line options; only C{opts.debug} is read here
    @raise errors.OpExecError: if one of the submissions did not fail

    """
    ToStdout("Testing job submission")

    # Each entry: (opcodes before, opcodes after, priority expected to fail)
    cases = [
        (0, 0, constants.OP_PRIO_LOWEST),
        (0, 0, constants.OP_PRIO_HIGHEST),
    ]

    base_priorities = constants.OP_PRIO_SUBMIT_VALID | frozenset(
        [constants.OP_PRIO_LOWEST, constants.OP_PRIO_HIGHEST])
    for base in base_priorities:
        for delta in [-1, +1]:
            shifted = base + delta
            cases.extend([
                (0, 0, shifted),
                (3, 0, shifted),
                (0, 3, shifted),
                (4, 2, shifted),
            ])

    for before, after, failpriority in cases:
        # The opcode with the tested priority sits between "before" and
        # "after" plain opcodes
        ops = [opcodes.OpTestDelay(duration=0) for _ in range(before)]
        ops.append(opcodes.OpTestDelay(duration=0, priority=failpriority))
        ops += [opcodes.OpTestDelay(duration=0) for _ in range(after)]

        # Single-job submission
        try:
            client = cli.GetClient()
            client.SubmitJob(ops)
        except errors.GenericError as exc:
            if opts.debug:
                ToStdout("Ignoring error for 'wrong priority' test: %s", exc)
        else:
            raise errors.OpExecError(
                "Submitting opcode with priority %s did not"
                " fail when it should (allowed are %s)" %
                (failpriority, constants.OP_PRIO_SUBMIT_VALID))

        # Multi-job submission: the first job uses default priorities, the
        # second one is the opcode list built above
        plain_job = [
            opcodes.OpTestDelay(duration=0),
            opcodes.OpTestDelay(duration=0, dry_run=False),
            opcodes.OpTestDelay(duration=0, dry_run=True)
        ]
        jobs = [plain_job, ops]
        try:
            client = cli.GetClient()
            client.SubmitManyJobs(jobs)
        except errors.GenericError as exc:
            if opts.debug:
                ToStdout("Ignoring error for 'wrong priority' test: %s", exc)
        else:
            raise errors.OpExecError(
                "Submitting manyjobs with an incorrect one"
                " did not fail when it should.")

    ToStdout("Job submission tests were successful")
Ejemplo n.º 3
0
def get_instance_nics(instance, logger):
    """Query Ganeti to get the instance's NICs.

    Get instance's NICs from Ganeti configuration data. If running on master,
    query Ganeti via Ganeti CLI client. Otherwise (the client raises
    C{OpPrereqError}), get the NICs from the Ganeti configuration file.

    Firewall profiles are taken from instance tags of the form
    C{synnefo:network:N:firewall_mode}, where C{N} is the index of the NIC.
    Malformed tags and tags for non-existent NICs are logged and skipped.

    @type instance: string
    @param instance: the name of the instance
    @param logger: logger used to report malformed tags
    @rtype: list of dicts
    @return: One dictionary per NIC. When queried via the client each
             dictionary has the keys 'network', 'ip', 'mac', 'mode' and
             'link'; when read from the configuration file it is the stored
             NIC entry with 'nicparams' replaced by 'mode' and 'link'. A
             'firewall' key is added for NICs named by a tag.

    """
    try:
        client = cli.GetClient()
        fields = [
            "nic.networks", "nic.ips", "nic.macs", "nic.modes", "nic.links",
            "tags"
        ]
        info = client.QueryInstances([instance], fields, use_locking=False)
        networks, ips, macs, modes, links, tags = info[0]
        nic_keys = ["network", "ip", "mac", "mode", "link"]
        # Build a real list: the NICs are indexed below and returned to the
        # caller, and zip()/map() are lazy iterators on Python 3.
        nics = [dict(zip(nic_keys, values))
                for values in zip(networks, ips, macs, modes, links)]
    except ganeti_errors.OpPrereqError:
        # Not running on master! Load the conf file
        raw_data = utils.ReadFile(constants.CLUSTER_CONF_FILE)
        config = serializer.LoadJson(raw_data)
        i = config["instances"][instance]
        nics = []
        for nic in i["nics"]:
            # Flatten the nested parameters into the NIC entry itself
            params = nic.pop("nicparams")
            nic["mode"] = params["mode"]
            nic["link"] = params["link"]
            nics.append(nic)
        tags = i.get("tags", [])

    # Get firewall from instance Tags
    # Tags are of the form synnefo:network:N:firewall_mode
    for tag in tags:
        t = tag.split(":")
        if t[0:2] != ["synnefo", "network"]:
            continue
        if len(t) != 4:
            logger.error("Malformed synefo tag %s", tag)
            continue
        try:
            index = int(t[2])
            nics[index]['firewall'] = t[3]
        except ValueError:
            logger.error("Malformed synnefo tag %s", tag)
        except IndexError:
            logger.error("Found tag %s for non-existent NIC %d", tag,
                         index)
    return nics
Ejemplo n.º 4
0
def _TestJobDependency(opts):
  """Tests job dependencies.

  Submits a single C{OpTestDelay} opcode whose C{depends} entry uses the
  relative job ID -1 and tolerates the C{errors.GenericError} raised by the
  submission.

  NOTE(review): a submission that unexpectedly succeeds is not reported by
  the code visible here -- confirm whether an C{else} branch raising an
  error is missing.

  @param opts: command line options; only C{opts.debug} is read here

  """
  ToStdout("Testing job dependencies")

  try:
    cl = cli.GetClient()
    SubmitOpCode(opcodes.OpTestDelay(duration=0, depends=[(-1, None)]), cl=cl)
  except errors.GenericError as err:
    # "as" form works on Python 2.6+ and Python 3 alike
    if opts.debug:
      ToStdout("Ignoring error for 'wrong dependencies' test: %s", err)
Ejemplo n.º 5
0
def GetLuxiClient(try_restart, query=False):
    """Tries to connect to the master daemon.

    @type try_restart: bool
    @param try_restart: Whether to attempt to restart the master daemon;
        NOTE(review): not used by the code visible here -- confirm
    @param query: passed through unchanged to C{cli.GetClient}
    @return: the object returned by C{cli.GetClient}
    @raise NotMasterError: if C{cli.GetClient} raises
        C{errors.OpPrereqError}

    """
    try:
        return cli.GetClient(query=query)
    except errors.OpPrereqError as err:
        # this is, from cli.GetClient, a not-master case
        raise NotMasterError("Not on master node (%s)" % err)
Ejemplo n.º 6
0
 def __init__(self):
     """Initialise empty state, parse options and fetch the initial state.

     After the attributes are reset, C{ParseOptions} is called, a client is
     obtained through C{cli.GetClient} and C{GetState} is called, in that
     order.

     """
     # Helper objects
     self.url_opener = SimpleOpener()
     self._feed_buf = StringIO()

     # Collections, all empty to begin with
     self.nodes = []
     self.instances = []
     self.to_rem = []
     self.queued_ops = []

     # Settings that have no value yet
     self.opts = None
     self.queue_retry = False
     self.disk_count = None
     self.disk_growth = None
     self.disk_size = None
     self.hvp = None
     self.bep = None

     # Order matters: options first, then the client, then the state
     self.ParseOptions()
     self.cl = cli.GetClient()
     self.GetState()
Ejemplo n.º 7
0
def GenericOpCodes(opts, args):
  """Queue opcodes loaded from JSON files as jobs and wait for the results.

  Every file named in C{args} is read and decoded C{opts.rep_job} times;
  each time its opcode list is repeated C{opts.rep_op} times and queued as
  one job. If C{opts.timing_stats} is set, progress markers, counters and
  timings are printed.

  @param opts: the command line options selected by the user
  @type args: list
  @param args: paths of the files with the opcode definitions
  @rtype: int
  @return: the desired exit code (always 0 here)

  """
  client = cli.GetClient()
  executor = cli.JobExecutor(cl=client, verbose=opts.verbose, opts=opts)

  show_timing = opts.timing_stats
  total_jobs = 0
  total_ops = 0

  if show_timing:
    ToStdout("Loading...")

  for job_idx in range(opts.rep_job):
    for fname in args:
      decoded = simplejson.loads(utils.ReadFile(fname))
      loaded = [opcodes.OpCode.LoadOpCode(entry) for entry in decoded]
      repeated = loaded * opts.rep_op
      job_name = "file %s/%d" % (fname, job_idx)
      # pylint: disable=W0142
      executor.QueueJob(job_name, *repeated)
      total_ops += len(repeated)
      total_jobs += 1

  if show_timing:
    started = time.time()
    ToStdout("Submitting...")

  executor.SubmitPending(each=opts.each)

  if show_timing:
    submitted = time.time()
    ToStdout("Executing...")

  executor.GetResults()

  if show_timing:
    finished = time.time()
    ToStdout("C:op     %4d" % total_ops)
    ToStdout("C:job    %4d" % total_jobs)
    ToStdout("T:submit %4.4f" % (submitted - started))
    ToStdout("T:exec   %4.4f" % (finished - submitted))
    ToStdout("T:total  %4.4f" % (finished - started))

  return 0
Ejemplo n.º 8
0
    def _ProcessTestMessage(self, job_id, sockname, test, arg):
        """Check the job's status against a job queue test message.

        Connects to the UNIX socket C{sockname}, looks up the current status
        of job C{job_id} and verifies that it matches what the test message
        C{test} implies. The socket is closed again before returning.

        @param job_id: ID of the job the message refers to
        @param sockname: address of the UNIX stream socket to connect to
        @param test: message type, must be in C{constants.JQT_ALL}
        @param arg: only logged for C{JQT_STARTMSG}; for C{JQT_LOGMSG} the
            number of entries expected in C{self._testmsgs}
        @raise errors.OpExecError: on an invalid message type, an unknown
            job, an unexpected job status or a wrong number of test messages

        """
        if test not in constants.JQT_ALL:
            raise errors.OpExecError("Received invalid test message %s" % test)

        conn = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        try:
            conn.settimeout(30.0)

            logging.debug("Connecting to %s", sockname)
            conn.connect(sockname)

            logging.debug("Checking status")
            client = cli.GetClient()
            job_row = client.QueryJobs([job_id], ["status"])[0]
            if not job_row:
                raise errors.OpExecError("Can't find job %s" % job_id)

            status = job_row[0]

            logging.debug("Status of job %s is %s", job_id, status)

            # The job status must fit the phase the message was sent from
            if test == constants.JQT_EXPANDNAMES:
                if status != constants.JOB_STATUS_WAITING:
                    raise errors.OpExecError(
                        "Job status while expanding names is '%s',"
                        " not '%s' as expected" %
                        (status, constants.JOB_STATUS_WAITING))
            elif test in (constants.JQT_EXEC, constants.JQT_LOGMSG):
                if status != constants.JOB_STATUS_RUNNING:
                    raise errors.OpExecError(
                        "Job status while executing opcode is '%s',"
                        " not '%s' as expected" %
                        (status, constants.JOB_STATUS_RUNNING))

            # Bookkeeping of the test messages themselves
            if test == constants.JQT_STARTMSG:
                logging.debug("Expecting %s test messages", arg)
                self._testmsgs = []
            elif test == constants.JQT_LOGMSG:
                received = len(self._testmsgs)
                if received != arg:
                    raise errors.OpExecError(
                        "Received %s test messages when %s are"
                        " expected" % (received, arg))
        finally:
            logging.debug("Closing socket")
            conn.close()
Ejemplo n.º 9
0
def TestJobqueue(opts, _):
  """Runs a few tests on the job queue.

  First runs the job submission and job dependency tests. Then, for each of
  four modes (single success, multiple successes, single failure, partial
  failure), a job made of C{OpTestJqueue} opcodes is sent and polled, and
  the following are verified: whether the job failed as expected, the
  number of results, the log messages seen by the reporter, the job ID the
  reporter saw, the final job status and the per-opcode statuses.

  @param opts: command line options, passed on to the sub-tests and applied
      to the opcodes through C{cli.SetGenericOpcodeOpts}
  @param _: unused
  @raise errors.OpExecError: if any of the checks fails
  @raise errors.ProgrammerError: on an unknown test mode

  """
  _TestJobSubmission(opts)
  _TestJobDependency(opts)

  (TM_SUCCESS,
   TM_MULTISUCCESS,
   TM_FAIL,
   TM_PARTFAIL) = range(4)
  TM_ALL = compat.UniqueFrozenset([
    TM_SUCCESS,
    TM_MULTISUCCESS,
    TM_FAIL,
    TM_PARTFAIL,
    ])

  for mode in TM_ALL:
    test_messages = [
      "Testing mode %s" % mode,
      "Hello World",
      "A",
      "",
      "B",
      "Foo|bar|baz",
      utils.TimestampForFilename(),
      ]

    fail = mode in (TM_FAIL, TM_PARTFAIL)

    if mode == TM_PARTFAIL:
      ToStdout("Testing partial job failure")
      ops = [
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=True),
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        ]
      # Messages are expected from the first three opcodes only; the fourth
      # one is expected to end in error status as well
      expect_messages = 3 * [test_messages]
      expect_opstatus = [
        constants.OP_STATUS_SUCCESS,
        constants.OP_STATUS_SUCCESS,
        constants.OP_STATUS_ERROR,
        constants.OP_STATUS_ERROR,
        ]
      expect_resultlen = 2
    elif mode == TM_MULTISUCCESS:
      ToStdout("Testing multiple successful opcodes")
      ops = [
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        opcodes.OpTestJqueue(notify_waitlock=True, notify_exec=True,
                             log_messages=test_messages, fail=False),
        ]
      expect_messages = 2 * [test_messages]
      expect_opstatus = [
        constants.OP_STATUS_SUCCESS,
        constants.OP_STATUS_SUCCESS,
        ]
      expect_resultlen = 2
    else:
      if mode == TM_SUCCESS:
        ToStdout("Testing job success")
        expect_opstatus = [constants.OP_STATUS_SUCCESS]
      elif mode == TM_FAIL:
        ToStdout("Testing job failure")
        expect_opstatus = [constants.OP_STATUS_ERROR]
      else:
        raise errors.ProgrammerError("Unknown test mode %s" % mode)

      ops = [
        opcodes.OpTestJqueue(notify_waitlock=True,
                             notify_exec=True,
                             log_messages=test_messages,
                             fail=fail),
        ]
      expect_messages = [test_messages]
      expect_resultlen = 1

    cl = cli.GetClient()
    cli.SetGenericOpcodeOpts(ops, opts)

    # Send job to master daemon
    job_id = cli.SendJob(ops, cl=cl)

    reporter = _JobQueueTestReporter()
    results = None

    try:
      results = cli.PollJob(job_id, cl=cl, reporter=reporter)
    except errors.OpExecError as err:
      # An error is only acceptable in the modes that are meant to fail
      if not fail:
        raise
      ToStdout("Ignoring error for 'job fail' test: %s", err)
    else:
      if fail:
        raise errors.OpExecError("Job didn't fail when it should")

    # Check length of result
    if fail:
      if results is not None:
        raise errors.OpExecError("Received result from failed job")
    elif len(results) != expect_resultlen:
      raise errors.OpExecError("Received %s results (%s), expected %s" %
                               (len(results), results, expect_resultlen))

    # Check received log messages
    all_messages = [i for j in expect_messages for i in j]
    if reporter.GetTestMessages() != all_messages:
      raise errors.OpExecError("Received test messages don't match input"
                               " (input %r, received %r)" %
                               (all_messages, reporter.GetTestMessages()))

    # Check final status
    reported_job_id = reporter.GetJobId()
    if reported_job_id != job_id:
      # The leading space of the second literal keeps the words apart
      raise errors.OpExecError("Reported job ID %s doesn't match"
                               " submission job ID %s" %
                               (reported_job_id, job_id))

    jobdetails = cli.GetClient().QueryJobs([job_id], ["status", "opstatus"])[0]
    if not jobdetails:
      raise errors.OpExecError("Can't find job %s" % job_id)

    if fail:
      exp_status = constants.JOB_STATUS_ERROR
    else:
      exp_status = constants.JOB_STATUS_SUCCESS

    (final_status, final_opstatus) = jobdetails
    if final_status != exp_status:
      raise errors.OpExecError("Final job status is %s, not %s as expected" %
                               (final_status, exp_status))
    if len(final_opstatus) != len(ops):
      raise errors.OpExecError("Did not receive status for all opcodes (got %s,"
                               " expected %s)" %
                               (len(final_opstatus), len(ops)))
    if final_opstatus != expect_opstatus:
      raise errors.OpExecError("Opcode status is %s, expected %s" %
                               (final_opstatus, expect_opstatus))
Ejemplo n.º 10
0
                         depends=[(-1, [])])],
    [opcodes.OpTestDelay(duration=1,
                         depends=[(-2, [constants.JOB_STATUS_SUCCESS])])],
    [opcodes.OpTestDelay(duration=1,
                         depends=[])],
    [opcodes.OpTestDelay(duration=1,
                         depends=[(-2, [constants.JOB_STATUS_SUCCESS])])],
    ]

  # Function for checking result
  check_fn = ht.TListOf(ht.TAnd(ht.TIsLength(2),
                                ht.TItems([ht.TBool,
                                           ht.TOr(ht.TNonEmptyString,
                                                  ht.TJobId)])))

  cl = cli.GetClient()
  result = cl.SubmitManyJobs(jobs)
  if not check_fn(result):
    raise errors.OpExecError("Job submission doesn't match %s: %s" %
                             (check_fn, result))

  # Wait for jobs to finish
  jex = JobExecutor(cl=cl, opts=opts)

  for (status, job_id) in result:
    jex.AddJobId(None, status, job_id)

  job_results = jex.GetResults()
  if not compat.all(row[0] for row in job_results):
    raise errors.OpExecError("At least one of the submitted jobs failed: %s" %
                             job_results)
Ejemplo n.º 11
0
def _TestJobDependency(opts):
    """Exercise job dependencies expressed through relative job IDs.

    First a plain opcode with a relative dependency is submitted, which is
    expected to fail. Then five single-opcode jobs are submitted together,
    three of which depend on an earlier job of the same submission. After
    all jobs finished, the resolved dependency job IDs and the order of
    execution are verified.

    @param opts: command line options; C{opts.debug} is read here and the
        object is passed on to C{JobExecutor}
    @raise errors.OpExecError: if any of the checks fails

    """
    ToStdout("Testing job dependencies")

    try:
        client = cli.GetClient()
        relative_op = opcodes.OpTestDelay(duration=0, depends=[(-1, None)])
        SubmitOpCode(relative_op, cl=client)
    except errors.GenericError as exc:
        if opts.debug:
            ToStdout("Ignoring error for 'wrong dependencies' test: %s", exc)
    else:
        raise errors.OpExecError("Submitting plain opcode with relative job ID"
                                 " did not fail as expected")

    # TODO: Test dependencies on errors
    # Jobs 1 and 2 refer back to job 0, job 4 refers back to job 2
    jobs = [
        [opcodes.OpTestDelay(duration=1)],
        [opcodes.OpTestDelay(duration=1, depends=[(-1, [])])],
        [opcodes.OpTestDelay(
            duration=1, depends=[(-2, [constants.JOB_STATUS_SUCCESS])])],
        [opcodes.OpTestDelay(duration=1, depends=[])],
        [opcodes.OpTestDelay(
            duration=1, depends=[(-2, [constants.JOB_STATUS_SUCCESS])])],
    ]

    # Validator for the submission result: a list of two-item entries, a
    # boolean followed by a non-empty string or a job ID
    entry_check = ht.TAnd(
        ht.TIsLength(2),
        ht.TItems([ht.TBool, ht.TOr(ht.TNonEmptyString, ht.TJobId)]))
    check_fn = ht.TListOf(entry_check)

    client = cli.GetClient()
    result = client.SubmitManyJobs(jobs)
    if not check_fn(result):
        raise errors.OpExecError("Job submission doesn't match %s: %s" %
                                 (check_fn, result))

    # Wait for jobs to finish
    executor = JobExecutor(cl=client, opts=opts)
    for (status, job_id) in result:
        executor.AddJobId(None, status, job_id)

    job_results = executor.GetResults()
    if not compat.all(row[0] for row in job_results):
        raise errors.OpExecError(
            "At least one of the submitted jobs failed: %s" % job_results)

    # Get details about jobs
    submitted_ids = [job_id for (_, job_id) in result]
    data = client.QueryJobs(submitted_ids, ["id", "opexec", "ops"])
    data_job_id = [jid for (jid, _, _) in data]
    data_opexec = [opexec for (_, opexec, _) in data]
    data_op = [[opcodes.OpCode.LoadOpCode(raw_op) for raw_op in raw_ops]
               for (_, _, raw_ops) in data]

    assert compat.all(not op.depends or len(op.depends) == 1
                      for job_ops in data_op for op in job_ops)

    # Check resolved job IDs in dependencies
    expected_deps = [
        (1, data_job_id[0]),
        (2, data_job_id[0]),
        (4, data_job_id[2]),
    ]
    for (job_idx, res_jobdep) in expected_deps:
        resolved = data_op[job_idx][0].depends[0][0]
        if resolved != res_jobdep:
            raise errors.OpExecError(
                "Job %s's opcode doesn't depend on correct job"
                " ID (%s)" % (job_idx, res_jobdep))

    # Check execution order
    ran_in_order = (data_opexec[0] <= data_opexec[1]
                    and data_opexec[0] <= data_opexec[2]
                    and data_opexec[2] <= data_opexec[4])
    if not ran_in_order:
        raise errors.OpExecError("Jobs did not run in correct order: %s" %
                                 data)

    assert len(jobs) == 5 and compat.all(len(ops) == 1 for ops in jobs)

    ToStdout("Job dependency tests were successful")
Ejemplo n.º 12
0
    except errors.OpPrereqError, err:
        # this is, from cli.GetClient, a not-master case
        raise NotMasterError("Not on master node (%s)" % err)

    except rpcerr.NoMasterError, err:
        if not try_restart:
            raise

        logging.warning(
            "Master daemon seems to be down (%s), trying to restart", err)

        if not utils.EnsureDaemon(constants.MASTERD):
            raise errors.GenericError("Can't start the master daemon")

        # Retry the connection
        return cli.GetClient(query=query)


def _StartGroupChildren(cl, wait):
    """Starts a new instance of the watcher for every node group.

    NOTE(review): only the beginning of this function is visible in this
    excerpt; the code shown queries the groups and builds one argument list
    per group, nothing is started yet.

    @param cl: client object providing C{QueryGroups}
    @param wait: not used in the visible part of the function

    """
    # The node group option must not already be on the command line
    assert not compat.any(
        arg.startswith(cli.NODEGROUP_OPT_NAME) for arg in sys.argv)

    # Request the "name" and "uuid" fields of the groups; presumably the
    # empty list selects all groups and False disables locking -- TODO confirm
    result = cl.QueryGroups([], ["name", "uuid"], False)

    children = []

    for (idx, (name, uuid)) in enumerate(result):
        # Current command line plus the node group option and the group UUID
        args = sys.argv + [cli.NODEGROUP_OPT_NAME, uuid]
Ejemplo n.º 13
0
def get_instance_info(instance, logger):
    """Query Ganeti to get the instance's info (NICs, disks and hvparams).

    Get instance's info from Ganeti configuration data. If running on
    master, query Ganeti via Ganeti CLI client. Otherwise (the client raises
    C{OpPrereqError}), get the info straight from Ganeti's configuration
    file.

    Firewall profiles are taken from instance tags of the form
    C{synnefo:network:NAME:firewall_mode} and stored under the 'firewall'
    key of every NIC whose name equals C{NAME}; a NIC that already has a
    'firewall' entry keeps it. Malformed tags are logged and skipped.

    @type instance: string
    @param instance: the name of the instance
    @param logger: logger used to report malformed tags
    @rtype: dict
    @return: Dictionary with the keys 'nics' (list of dicts), 'disks' (list
             of dicts) and 'hvparams' (dict with 'boot_order' and
             'cdrom_image_path') of the instance.

    """
    try:
        client = cli.GetClient()
        q_fields = ["nic.names", "nic.networks.names", "nic.ips", "nic.macs",
                    "nic.modes", "nic.links", "nic.uuids", "tags",
                    "disk.names", "disk.sizes", "disk.uuids",
                    "hv/boot_order", 'hv/cdrom_image_path']
        info = client.QueryInstances([instance], q_fields, use_locking=False)
        row = info[0]
        # Parse NICs (everything but the last five fields, tags included).
        # Real lists are built because the NICs are iterated once per tag
        # below and returned to the caller; zip()/map() are one-shot
        # iterators on Python 3.
        names, networks, ips, macs, modes, links, uuids, tags = row[:-5]
        nic_keys = ["name", "network", "ip", "mac", "mode", "link", "uuid"]
        nics = [dict(zip(nic_keys, values))
                for values in zip(names, networks, ips, macs, modes, links,
                                  uuids)]
        # Parse Disks
        names, sizes, uuids = row[-5:-2]
        disk_keys = ["name", "size", "uuid"]
        disks = [dict(zip(disk_keys, values))
                 for values in zip(names, sizes, uuids)]

        hvparams = {'boot_order': row[-2],
                    'cdrom_image_path': row[-1]}

    except ganeti_errors.OpPrereqError:
        # Not running on master! Load the conf file
        raw_data = utils.ReadFile(pathutils.CLUSTER_CONF_FILE)
        config = serializer.LoadJson(raw_data)
        i = config["instances"][instance]
        # Parse NICs
        nics = []
        for index, nic in enumerate(i["nics"]):
            # Flatten the nested parameters into the NIC entry itself
            params = nic.pop("nicparams")
            nic["mode"] = params["mode"]
            nic["link"] = params["link"]
            nic["index"] = index
            nics.append(nic)
        # Parse Disks
        disks = []
        for index, disk in enumerate(i["disks"]):
            disks.append({"name": disk.pop("name"),
                          "size": disk["size"],
                          "uuid": disk["uuid"],
                          "index": index})
        tags = i.get("tags", [])

        hvparams = {"boot_order": i["hvparams"]["boot_order"],
                    "cdrom_image_path": i["hvparams"]["cdrom_image_path"]}

    # Get firewall from instance Tags
    # Tags are of the form synnefo:network:N:firewall_mode
    for tag in tags:
        t = tag.split(":")
        if t[0:2] != ["synnefo", "network"]:
            continue
        if len(t) != 4:
            logger.error("Malformed synefo tag %s", tag)
            continue
        nic_name, firewall = t[2], t[3]
        for nic in nics:
            # .get(): a NIC entry without a name simply never matches a tag
            if nic.get("name") == nic_name:
                nic.setdefault("firewall", firewall)

    attachments = {"nics": nics,
                   "disks": disks,
                   "hvparams": hvparams}
    return attachments
Ejemplo n.º 14
0
    except errors.OpPrereqError, err:
        # this is, from cli.GetClient, a not-master case
        raise NotMasterError("Not on master node (%s)" % err)

    except rpcerr.NoMasterError, err:
        if not try_restart:
            raise

        logging.warning("Luxi daemon seems to be down (%s), trying to restart",
                        err)

        if not utils.EnsureDaemon(constants.LUXID):
            raise errors.GenericError("Can't start the master daemon")

        # Retry the connection
        return cli.GetClient()


def _StartGroupChildren(cl, wait):
    """Starts a new instance of the watcher for every node group.

    NOTE(review): only the beginning of this function is visible in this
    excerpt; the code shown queries the groups and builds one argument list
    per group, nothing is started yet.

    @param cl: client object providing C{QueryGroups}
    @param wait: not used in the visible part of the function

    """
    # The node group option must not already be on the command line
    assert not compat.any(
        arg.startswith(cli.NODEGROUP_OPT_NAME) for arg in sys.argv)

    # Request the "name" and "uuid" fields of the groups; presumably the
    # empty list selects all groups and False disables locking -- TODO confirm
    result = cl.QueryGroups([], ["name", "uuid"], False)

    children = []

    for (idx, (name, uuid)) in enumerate(result):
        # Current command line plus the node group option and the group UUID
        args = sys.argv + [cli.NODEGROUP_OPT_NAME, uuid]