Example #1
def cluster_node_heartbeat():
    logger.debug("Cluster node heartbeat task.")
    nowtime = now()
    instance_list = list(Instance.objects.all())
    this_inst = None
    lost_instances = []

    for inst in instance_list:
        if inst.hostname == settings.CLUSTER_HOST_ID:
            this_inst = inst
            break

    inspect_execution_nodes(instance_list)

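    # Flag peers that have missed their heartbeat window; they are handled
    # separately below and removed from the working list.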
    for inst in list(instance_list):
        if inst == this_inst:
            continue
        if inst.is_lost(ref_time=nowtime):
            lost_instances.append(inst)
            instance_list.remove(inst)

    if this_inst:
        startup_event = this_inst.is_lost(ref_time=nowtime)
        this_inst.local_health_check()
        if startup_event and this_inst.capacity != 0:
            logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
            return
    else:
        raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
    # IFF any node has a greater version than we do, then we'll shut down services
    for other_inst in instance_list:
        if other_inst.node_type in ('execution', 'hop'):
            continue
        if other_inst.version == "" or other_inst.version.startswith('ansible-runner'):
            continue
        if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
            logger.error(
                "Host {} reports version {}, but this node {} is at {}, shutting down".format(
                    other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version
                )
            )
            # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
            # The heartbeat task will reset the capacity to the system capacity after upgrade.
            stop_local_services(communicate=False)
            raise RuntimeError("Shutting down.")

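    # Clean up after lost instances: reap their jobs, then either deprovision
    # them automatically or mark them offline.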
    for other_inst in lost_instances:
        try:
            reaper.reap(other_inst)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
        try:
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                deprovision_hostname = other_inst.hostname
                other_inst.delete()
                logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
            elif other_inst.capacity != 0 or (not other_inst.errors):
                other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
                logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))

        except DatabaseError as e:
            if 'did not affect any rows' in str(e):
                logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname))
            else:
                logger.exception('Error marking {} as lost'.format(other_inst.hostname))
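
The version gate above only compares control-plane peers: execution and hop nodes are skipped, as are peers reporting an empty or ansible-runner version string. Below is a minimal, self-contained sketch of that comparison, assuming Version comes from packaging.version and the version strings are PEP 440 compatible; the function name is made up for illustration and is not part of the project.

# Minimal sketch of the version gate, assuming packaging.version.Version.
from packaging.version import Version

def peer_requires_shutdown(peer_version: str, local_version: str, debug: bool = False) -> bool:
    """Return True when a peer reports a strictly newer application version."""
    # Empty or ansible-runner version strings come from execution nodes and are ignored.
    if not peer_version or peer_version.startswith('ansible-runner'):
        return False
    # DEBUG mode disables the shutdown behavior, as in the task above.
    if debug:
        return False
    # Strip any '-suffix' before comparing, mirroring the task above.
    return Version(peer_version.split('-', 1)[0]) > Version(local_version.split('-', 1)[0])

# peer_requires_shutdown('23.1.0', '23.0.0')               -> True
# peer_requires_shutdown('ansible-runner-2.4.0', '23.0.0') -> False
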
Example #2
def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
    logger.debug("Cluster node heartbeat task.")
    nowtime = now()
    instance_list = list(Instance.objects.all())
    this_inst = None
    lost_instances = []

    for inst in instance_list:
        if inst.hostname == settings.CLUSTER_HOST_ID:
            this_inst = inst
            break

    inspect_execution_nodes(instance_list)

    for inst in list(instance_list):
        if inst == this_inst:
            continue
        if inst.is_lost(ref_time=nowtime):
            lost_instances.append(inst)
            instance_list.remove(inst)

    if this_inst:
        startup_event = this_inst.is_lost(ref_time=nowtime)
        last_last_seen = this_inst.last_seen
        this_inst.local_health_check()
        if startup_event and this_inst.capacity != 0:
            logger.warning(
                f'Rejoining the cluster as instance {this_inst.hostname}. Prior last_seen {last_last_seen}'
            )
            return
        elif not last_last_seen:
            logger.warning(
                f'Instance does not have recorded last_seen, updating to {nowtime}'
            )
        elif (nowtime - last_last_seen) > timedelta(seconds=settings.CLUSTER_NODE_HEARTBEAT_PERIOD + 2):
            logger.warning(
                f'Heartbeat skew - interval={(nowtime - last_last_seen).total_seconds():.4f}, expected={settings.CLUSTER_NODE_HEARTBEAT_PERIOD}'
            )
    else:
        if settings.AWX_AUTO_DEPROVISION_INSTANCES:
            (changed, this_inst) = Instance.objects.register(
                ip_address=os.environ.get('MY_POD_IP'),
                node_type='control',
                uuid=settings.SYSTEM_UUID)
            if changed:
                logger.warning(
                    f'Recreated instance record {this_inst.hostname} after unexpected removal'
                )
            this_inst.local_health_check()
        else:
            raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
    # IFF any node has a greater version than we do, then we'll shut down services
    for other_inst in instance_list:
        if other_inst.node_type in ('execution', 'hop'):
            continue
        if other_inst.version == "" or other_inst.version.startswith('ansible-runner'):
            continue
        if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
            logger.error(
                "Host {} reports version {}, but this node {} is at {}, shutting down".format(
                    other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version
                )
            )
            # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
            # The heartbeat task will reset the capacity to the system capacity after upgrade.
            stop_local_services(communicate=False)
            raise RuntimeError("Shutting down.")

    for other_inst in lost_instances:
        try:
            explanation = "Job reaped due to instance shutdown"
            reaper.reap(other_inst, job_explanation=explanation)
            reaper.reap_waiting(other_inst, grace_period=0, job_explanation=explanation)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
        try:
            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                deprovision_hostname = other_inst.hostname
                other_inst.delete()
                logger.info("Host {} Automatically Deprovisioned.".format(
                    deprovision_hostname))
            elif other_inst.capacity != 0 or (not other_inst.errors):
                other_inst.mark_offline(errors=_(
                    'Another cluster node has determined this instance to be unresponsive'
                ))
                logger.error(
                    "Host {} last checked in at {}, marked as lost.".format(
                        other_inst.hostname, other_inst.last_seen))

        except DatabaseError as e:
            if 'did not affect any rows' in str(e):
                logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname))
            else:
                logger.exception('Error marking {} as lost'.format(other_inst.hostname))

    # Run the local reaper for this node, excluding any task UUIDs reported
    # as still active in worker_tasks.
    if worker_tasks is not None:
        active_task_ids = []
        for task_list in worker_tasks.values():
            active_task_ids.extend(task_list)
        reaper.reap(instance=this_inst, excluded_uuids=active_task_ids)
        if max(len(task_list) for task_list in worker_tasks.values()) <= 1:
            reaper.reap_waiting(instance=this_inst, excluded_uuids=active_task_ids, ref_time=datetime.fromisoformat(dispatch_time))
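
Example #2 also adds two parameters that are presumably supplied by the task dispatcher: dispatch_time, an ISO-8601 string parsed with datetime.fromisoformat, and worker_tasks, a mapping of workers to the task UUIDs they are currently running, which become excluded_uuids for the local reaper. The call below only illustrates those shapes; the worker ids and UUID are hypothetical.

# Illustrative invocation only; the worker ids and task UUID are made up.
from django.utils.timezone import now

worker_tasks = {
    0: ['9d3c1f2e-0000-0000-0000-000000000000'],  # worker 0 reports one active task
    1: [],                                        # worker 1 is idle
}
cluster_node_heartbeat(dispatch_time=now().isoformat(), worker_tasks=worker_tasks)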