Exemple #1
0
    def _first_heartbeat(self, sender, beat):
        """Create the resource record for an EEAgent's first heartbeat.

        @param sender: name of the EEAgent the heartbeat came from
        @param beat: heartbeat message; 'node_id' and 'timestamp' are read
        @return: None

        The heartbeat is logged and dropped, without creating a resource,
        when it carries no node_id, when the node is not in the store, when
        the node's domain_id cannot be mapped to an engine, or when the node
        is removed while the resource is being attached to it.
        """

        node_id = beat.get('node_id')
        if not node_id:
            log.error("EE heartbeat from %s without a node_id!: %s", sender, beat)
            return

        node = self.store.get_node(node_id)
        if node is None:
            log.warning("EE heartbeat from unknown node. Still booting? "
                        "node_id=%s sender=%s.", node_id, sender)

            # TODO I'm thinking the best thing to do here is query EPUM
            # for the state of this node in case the initial node_state
            # update got lost. Note that we shouldn't go ahead and
            # schedule processes onto this EE until we get the RUNNING
            # node_state update -- there could be a failure later on in
            # the contextualization process that triggers the node to be
            # terminated.

            return

        # work on a copy so the node's own properties are never mutated
        properties = node.properties.copy() if node.properties else {}

        log.info("First heartbeat from EEAgent %s on node %s (%s)",
            sender, node_id, properties.get("hostname", "unknown hostname"))

        try:
            engine_id = engine_id_from_domain(node.domain_id)
        except ValueError:
            log.exception("Node for EEagent %s has invalid domain_id!", sender)
            return

        engine_spec = self.get_engine(engine_id)
        slots = engine_spec.slots

        # just making engine type a generic property/constraint for now,
        # until it is clear something more formal is needed.
        properties['engine'] = engine_id

        try:
            self.node_add_resource(node, sender)
        except NotFoundError:
            log.warning("Node removed while processing heartbeat. ignoring. "
                        "node_id=%s sender=%s.", node_id, sender)
            return

        # the resource starts out with the time of this very heartbeat
        timestamp_str = beat['timestamp']
        timestamp = ceiling_datetime(parse_datetime(timestamp_str))

        resource = ResourceRecord.new(sender, node_id, slots, properties)
        resource.new_last_heartbeat_datetime(timestamp)
        try:
            self.store.add_resource(resource)
        except WriteConflictError:
            # no problem if this resource was just created by another worker
            log.info("Conflict writing new resource record %s. Ignoring.", sender)
Exemple #2
0
    def test_heartbeat_timestamps(self):
        """A resource's last heartbeat time only ever moves forward."""

        # register a running node so that heartbeats for it are accepted
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        earliest = parse_datetime("2013-04-02T19:37:57.617734+00:00")
        middle = parse_datetime("2013-04-02T19:38:57.617734+00:00")
        latest = parse_datetime("2013-04-02T19:39:57.617734+00:00")

        # (timestamp carried by the heartbeat, timestamp expected on the
        # resource afterwards), in the order the heartbeats are delivered
        deliveries = (
            (earliest, earliest),
            (latest, latest),
            # out of order heartbeat: the stored time shouldn't be updated
            (middle, latest),
        )

        for sent, expected in deliveries:
            self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=sent.isoformat()))
            resource = self.store.get_resource("eeagent1")
            self.assertEqual(resource.last_heartbeat_datetime, expected)
Exemple #3
0
    def ee_heartbeat(self, sender, beat):
        """Incoming heartbeat from an EEAgent

        @param sender: ION name of sender
        @param beat: information about running processes
        @return: None

        When an EEAgent starts, it immediately begins sending heartbeats to
        the PD. The first received heartbeat will trigger the PD to mark the
        EE as available in its slot tables, and potentially start deploying
        some WAITING process requests.

        The heartbeat message will consist of at least these fields:
            - node id - unique ID for the provisioned resource (VM) the EE runs on
            - timestamp - time heartbeat was generated
            - processes - list of process entries, each providing 'upid',
              'round' and 'state'
        """

        # sender can be in the format $sysname.$eename when CFG.dashi.sysname
        # is set, or it will be just $eename, if there is no sysname set.
        # We need to make sure that we remove the sysname when it is enabled to
        # get the correct eeagent name.
        if '.' in sender:
            sender = sender.split('.')[-1]
        resource = self.store.get_resource(sender)
        if resource is None:
            # first heartbeat from this EE
            self._first_heartbeat(sender, beat)
            return  # *** EARLY RETURN **

        resource_updated = False

        timestamp_str = beat['timestamp']
        timestamp = ceiling_datetime(parse_datetime(timestamp_str))

        # only ever move the last-heartbeat time forward, so that heartbeats
        # arriving out of order cannot rewind it
        resource_timestamp = resource.last_heartbeat_datetime
        if resource_timestamp is None or timestamp > resource_timestamp:
            resource.new_last_heartbeat_datetime(timestamp)
            resource_updated = True

        # keys of processes this heartbeat confirms as assigned to the resource
        assigned_procs = set()
        processes = beat['processes']
        node_exclusives_to_remove = []
        for procstate in processes:
            upid = procstate['upid']
            proc_round = int(procstate['round'])
            state = procstate['state']

            # TODO hack to handle how states are formatted in EEAgent heartbeat
            if isinstance(state, (list, tuple)):
                state = "-".join(str(s) for s in state)

            # TODO owner?
            process = self.store.get_process(None, upid)
            if not process:
                log.warning("EE reports process %s that is unknown!", upid)

                # NOTE(review): states are compared with < and >= against
                # ProcessState.TERMINATED throughout; this presumes the
                # ProcessState values sort in lifecycle order -- confirm
                if state < ProcessState.TERMINATED:
                    assigned_procs.add((None, upid, proc_round))
                else:
                    self.eeagent_client.cleanup_process(sender, upid, proc_round)

                continue

            if proc_round < process.round:
                # skip heartbeat info for processes that are already redeploying
                # but send a cleanup request first
                self.eeagent_client.cleanup_process(sender, upid, proc_round)
                continue

            if state == process.state:

                # if we managed to update the process record already for a
                # terminated process but didn't update the resource record,
                # clean up the process
                if state >= ProcessState.TERMINATED:
                    self.eeagent_client.cleanup_process(sender, upid, proc_round)

                continue

            if process.state == ProcessState.PENDING and \
               state == ProcessState.RUNNING:

                assigned_procs.add(process.key)

                # mark as running and notify subscriber
                process, changed = self.process_change_state(
                    process, ProcessState.RUNNING)

            elif state in (ProcessState.TERMINATED, ProcessState.FAILED,
                           ProcessState.EXITED):

                # process has died in resource. Obvious culprit is that it was
                # killed on request.

                if process.node_exclusive:
                    node_exclusives_to_remove.append(process.node_exclusive)

                if process.state == ProcessState.TERMINATING:
                    # mark as terminated and notify subscriber
                    process, updated = self.process_change_state(
                        process, ProcessState.TERMINATED, assigned=None)

                # otherwise it may need to be rescheduled
                elif process.state in (ProcessState.PENDING,
                                    ProcessState.RUNNING):

                    if self.process_should_restart(process, state):
                        self.process_next_round(process)
                    else:
                        self.process_change_state(process, state, assigned=None)

                # send cleanup request to EEAgent now that we have dealt
                # with the dead process
                self.eeagent_client.cleanup_process(sender, upid, proc_round)

        # rebuild the assignment list, keeping only entries that are still live
        new_assigned = []
        for owner, upid, assigned_round in resource.assigned:
            key = (owner, upid, assigned_round)
            process = self.store.get_process(owner, upid)

            if key in assigned_procs:
                new_assigned.append(key)
            # prune process assignments once the process has terminated or
            # moved onto the next round
            elif (process and process.round == assigned_round
                 and process.state < ProcessState.TERMINATED):
                new_assigned.append(key)

        # entries are only ever dropped above, so a change in length is
        # enough to detect that the assignments changed
        if len(new_assigned) != len(resource.assigned):
            # first update node exclusive tags
            if node_exclusives_to_remove:
                node = self.store.get_node(resource.node_id)
                if node:
                    self.node_remove_exclusive_tags(node, node_exclusives_to_remove)
                else:
                    log.warning("Node %s not found while attempting to update node_exclusive",
                        resource.node_id)

            if log.isEnabledFor(logging.DEBUG):
                old_assigned_set = set(tuple(item) for item in resource.assigned)
                new_assigned_set = set(tuple(item) for item in new_assigned)
                difference_message = get_set_difference_debug_message(
                    old_assigned_set, new_assigned_set)
                log.debug("updating resource %s assignments: %s",
                    resource.resource_id, difference_message)

            resource.assigned = new_assigned
            resource_updated = True

        if resource_updated:
            try:
                self.store.update_resource(resource)
            except (WriteConflictError, NotFoundError):
                # TODO? right now this will just wait for the next heartbeat
                pass