Example #1
0
class Instance(models.Model):
    """A model representing an AWX instance running against this database."""
    objects = InstanceManager()

    # Identifier self-reported by the instance process.
    uuid = models.CharField(max_length=40)
    # Cluster-wide unique hostname; used below as the join key against UnifiedJob.execution_node.
    hostname = models.CharField(max_length=250, unique=True)
    created = models.DateTimeField(auto_now_add=True)
    # Refreshed on every save; doubles as the heartbeat timestamp consumed by is_lost().
    modified = models.DateTimeField(auto_now=True)
    # Timestamp of the last isolated-node health check; auto_now_add seeds it at row creation.
    last_isolated_check = models.DateTimeField(null=True,
                                               editable=False,
                                               auto_now_add=True)
    version = models.CharField(max_length=24, blank=True)
    # Scheduling capacity units; 0 effectively disables the node for new work.
    capacity = models.PositiveIntegerField(
        default=100,
        editable=False,
    )

    class Meta:
        app_label = 'main'

    def get_absolute_url(self, request=None):
        """Return the API detail URL for this instance record."""
        return reverse('api:instance_detail',
                       kwargs={'pk': self.pk},
                       request=request)

    @property
    def consumed_capacity(self):
        """Sum of task_impact over jobs currently running or waiting on this node."""
        return sum(x.task_impact for x in UnifiedJob.objects.filter(
            execution_node=self.hostname, status__in=('running', 'waiting')))

    @property
    def role(self):
        # NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing
        return "awx"

    def is_lost(self, ref_time=None, isolated=False):
        """Return True when this instance has not heartbeat (saved) within the grace period.

        ref_time defaults to now(); isolated nodes get a grace period derived
        from their periodic check interval instead of the flat 120 seconds.
        """
        if ref_time is None:
            ref_time = now()
        grace_period = 120
        if isolated:
            # Allow one missed isolated check before declaring the node lost.
            grace_period = settings.AWX_ISOLATED_PERIODIC_CHECK * 2
        return self.modified < ref_time - timedelta(seconds=grace_period)
Example #2
0
class Instance(HasPolicyEditsMixin, BaseModel):
    """A model representing an AWX instance running against this database."""
    objects = InstanceManager()

    # Identifier self-reported by the instance process.
    uuid = models.CharField(max_length=40)
    # Cluster-wide unique hostname; join key against UnifiedJob.execution_node.
    hostname = models.CharField(max_length=250, unique=True)
    created = models.DateTimeField(auto_now_add=True)
    # Refreshed on every save; doubles as the heartbeat timestamp consumed by is_lost().
    modified = models.DateTimeField(auto_now=True)
    # Timestamp of the last isolated-node health check (set elsewhere; not auto-populated).
    last_isolated_check = models.DateTimeField(
        null=True,
        editable=False,
    )
    version = models.CharField(max_length=24, blank=True)
    # Scheduling capacity units; recomputed by refresh_capacity(), 0 when disabled.
    capacity = models.PositiveIntegerField(
        default=100,
        editable=False,
    )
    # User-tunable multiplier (0.00-9.99) applied when deriving capacity.
    capacity_adjustment = models.DecimalField(
        default=Decimal(1.0),
        max_digits=3,
        decimal_places=2,
        validators=[MinValueValidator(0)])
    enabled = models.BooleanField(default=True)
    managed_by_policy = models.BooleanField(default=True)
    # Raw resource measurements, refreshed by refresh_capacity().
    cpu = models.IntegerField(
        default=0,
        editable=False,
    )
    memory = models.BigIntegerField(
        default=0,
        editable=False,
    )
    # Capacity as derived from cpu / memory respectively.
    cpu_capacity = models.IntegerField(
        default=0,
        editable=False,
    )
    mem_capacity = models.IntegerField(
        default=0,
        editable=False,
    )

    class Meta:
        app_label = 'main'
        ordering = ("hostname", )

    # Fields whose edits are tracked by the HasPolicyEditsMixin machinery.
    POLICY_FIELDS = frozenset(
        ('managed_by_policy', 'hostname', 'capacity_adjustment'))

    def get_absolute_url(self, request=None):
        """Return the API detail URL for this instance record."""
        return reverse('api:instance_detail',
                       kwargs={'pk': self.pk},
                       request=request)

    @property
    def consumed_capacity(self):
        """Sum of task_impact over jobs currently running or waiting on this node."""
        return sum(x.task_impact for x in UnifiedJob.objects.filter(
            execution_node=self.hostname, status__in=('running', 'waiting')))

    @property
    def remaining_capacity(self):
        """Capacity still available for new work (may go negative under load)."""
        return self.capacity - self.consumed_capacity

    @property
    def role(self):
        # NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing
        return "awx"

    @property
    def jobs_running(self):
        """Count of jobs currently running or waiting on this node."""
        return UnifiedJob.objects.filter(execution_node=self.hostname,
                                         status__in=(
                                             'running',
                                             'waiting',
                                         )).count()

    @property
    def jobs_total(self):
        """Count of all jobs ever executed on this node."""
        return UnifiedJob.objects.filter(execution_node=self.hostname).count()

    def is_lost(self, ref_time=None, isolated=False):
        """Return True when this instance has not heartbeat (saved) within the grace period."""
        if ref_time is None:
            ref_time = now()
        grace_period = 120
        if isolated:
            # Allow one missed isolated check before declaring the node lost.
            grace_period = settings.AWX_ISOLATED_PERIODIC_CHECK * 2
        return self.modified < ref_time - timedelta(seconds=grace_period)

    def is_controller(self):
        """Return True if this instance controls at least one isolated instance group."""
        return Instance.objects.filter(
            rampart_groups__controller__instances=self).exists()

    def is_isolated(self):
        """Return True if this instance belongs to a controlled (isolated) instance group."""
        return self.rampart_groups.filter(controller__isnull=False).exists()

    def refresh_capacity(self):
        """Re-measure system resources and persist the derived capacity fields."""
        cpu = get_cpu_capacity()
        mem = get_mem_capacity()
        if self.enabled:
            self.capacity = get_system_task_capacity(self.capacity_adjustment)
        else:
            # Disabled nodes advertise zero capacity so the scheduler skips them.
            self.capacity = 0
        # Each helper returns (raw measurement, derived capacity).
        self.cpu = cpu[0]
        self.memory = mem[0]
        self.cpu_capacity = cpu[1]
        self.mem_capacity = mem[1]
        self.version = awx_application_version
        self.save(update_fields=[
            'capacity', 'version', 'modified', 'cpu', 'memory', 'cpu_capacity',
            'mem_capacity'
        ])
Example #3
0
File: ha.py — Project: mahak/awx
class Instance(HasPolicyEditsMixin, BaseModel):
    """A model representing an AWX instance running against this database."""

    objects = InstanceManager()

    # Fields set in instance registration
    uuid = models.CharField(max_length=40, default=UUID_DEFAULT)
    hostname = models.CharField(max_length=250, unique=True)
    ip_address = models.CharField(
        blank=True,
        null=True,
        default=None,
        max_length=50,
        unique=True,
    )
    # Auto-fields, implementation is different from BaseModel
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)
    # Fields defined in health check or heartbeat
    version = models.CharField(max_length=120, blank=True)
    cpu = models.DecimalField(
        default=Decimal(0.0),
        max_digits=4,
        decimal_places=1,
        editable=False,
    )
    memory = models.BigIntegerField(
        default=0,
        editable=False,
        help_text=_('Total system memory of this instance in bytes.'),
    )
    errors = models.TextField(
        default='',
        blank=True,
        editable=False,
        help_text=_('Any error details from the last health check.'),
    )
    last_seen = models.DateTimeField(
        null=True,
        editable=False,
        help_text=
        _('Last time instance ran its heartbeat task for main cluster nodes. Last known connection to receptor mesh for execution nodes.'
          ),
    )
    last_health_check = models.DateTimeField(
        null=True,
        editable=False,
        help_text=
        _('Last time a health check was ran on this instance to refresh cpu, memory, and capacity.'
          ),
    )
    # Capacity management
    capacity = models.PositiveIntegerField(
        default=100,
        editable=False,
    )
    # User-tunable multiplier (0.00-9.99) applied in set_capacity_value().
    capacity_adjustment = models.DecimalField(
        default=Decimal(1.0),
        max_digits=3,
        decimal_places=2,
        validators=[MinValueValidator(0)])
    enabled = models.BooleanField(default=True)
    managed_by_policy = models.BooleanField(default=True)

    # Capacity as derived from cpu / memory respectively.
    cpu_capacity = models.IntegerField(
        default=0,
        editable=False,
    )
    mem_capacity = models.IntegerField(
        default=0,
        editable=False,
    )
    NODE_TYPE_CHOICES = [
        ("control", "Control plane node"),
        ("execution", "Execution plane node"),
        ("hybrid", "Controller and execution"),
        ("hop", "Message-passing node, no execution capability"),
    ]
    node_type = models.CharField(default='hybrid',
                                 choices=NODE_TYPE_CHOICES,
                                 max_length=16)

    # Receptor mesh links originating at this node.
    peers = models.ManyToManyField('self',
                                   symmetrical=False,
                                   through=InstanceLink,
                                   through_fields=('source', 'target'))

    class Meta:
        app_label = 'main'
        ordering = ("hostname", )

    # Fields whose edits are tracked by the HasPolicyEditsMixin machinery.
    POLICY_FIELDS = frozenset(
        ('managed_by_policy', 'hostname', 'capacity_adjustment'))

    def get_absolute_url(self, request=None):
        """Return the API detail URL for this instance record."""
        return reverse('api:instance_detail',
                       kwargs={'pk': self.pk},
                       request=request)

    @property
    def consumed_capacity(self):
        """Total capacity consumed by jobs this node is executing and/or controlling.

        Execution-side impact is the per-job task_impact; control-side impact is
        a flat per-job cost from settings. Hybrid nodes pay both.
        """
        capacity_consumed = 0
        if self.node_type in ('hybrid', 'execution'):
            capacity_consumed += sum(x.task_impact
                                     for x in UnifiedJob.objects.filter(
                                         execution_node=self.hostname,
                                         status__in=('running', 'waiting')))
        if self.node_type in ('hybrid', 'control'):
            capacity_consumed += sum(settings.AWX_CONTROL_NODE_TASK_IMPACT
                                     for x in UnifiedJob.objects.filter(
                                         controller_node=self.hostname,
                                         status__in=('running', 'waiting')))
        return capacity_consumed

    @property
    def remaining_capacity(self):
        """Capacity still available for new work (may go negative under load)."""
        return self.capacity - self.consumed_capacity

    @property
    def jobs_running(self):
        """Count of jobs currently running or waiting on this node."""
        return UnifiedJob.objects.filter(
            execution_node=self.hostname,
            status__in=(
                'running',
                'waiting',
            ),
        ).count()

    @property
    def jobs_total(self):
        """Count of all jobs ever executed on this node."""
        return UnifiedJob.objects.filter(execution_node=self.hostname).count()

    def get_cleanup_task_kwargs(self, **kwargs):
        """
        Produce options to use for the command: ansible-runner worker cleanup
        returns a dict that is passed to the python interface for the runner method corresponding to that command
        any kwargs will override that key=value combination in the returned dict
        """
        vargs = dict()
        if settings.AWX_CLEANUP_PATHS:
            vargs['file_pattern'] = os.path.join(
                settings.AWX_ISOLATION_BASE_PATH,
                JOB_FOLDER_PREFIX % '*') + '*'
        vargs.update(kwargs)
        if not isinstance(vargs.get('grace_period'), int):
            vargs[
                'grace_period'] = 60  # grace period of 60 minutes, need to set because CLI default will not take effect
        if 'exclude_strings' not in vargs and vargs.get('file_pattern'):
            # Never clean up folders of jobs this node is still executing or controlling.
            active_pks = list(
                UnifiedJob.objects.filter(
                    (models.Q(execution_node=self.hostname)
                     | models.Q(controller_node=self.hostname))
                    & models.Q(status__in=('running', 'waiting'))).values_list(
                        'pk', flat=True))
            if active_pks:
                vargs['exclude_strings'] = [
                    JOB_FOLDER_PREFIX % job_id for job_id in active_pks
                ]
        if 'remove_images' in vargs or 'image_prune' in vargs:
            # Image cleanup requires a container runtime; default to podman.
            vargs.setdefault('process_isolation_executable', 'podman')
        return vargs

    def is_lost(self, ref_time=None):
        """Return True when this instance has not been seen within the grace period.

        The grace period is two heartbeat intervals; mesh-only node types get
        extra slack for the receptor advertisement cadence.
        """
        if self.last_seen is None:
            return True
        if ref_time is None:
            ref_time = now()
        grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2
        if self.node_type in ('execution', 'hop'):
            grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD
        return self.last_seen < ref_time - timedelta(seconds=grace_period)

    def mark_offline(self,
                     update_last_seen=False,
                     perform_save=True,
                     errors=''):
        """Zero out all capacity fields (and optionally record errors / last_seen).

        No-op when the record already reflects the requested offline state,
        which avoids pointless writes from repeated health checks.
        """
        if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and self.errors == errors and (
                not update_last_seen):
            return
        self.cpu_capacity = self.mem_capacity = self.capacity = 0
        self.errors = errors
        if update_last_seen:
            self.last_seen = now()

        if perform_save:
            update_fields = [
                'capacity', 'cpu_capacity', 'mem_capacity', 'errors'
            ]
            if update_last_seen:
                update_fields += ['last_seen']
            self.save(update_fields=update_fields)

    def set_capacity_value(self):
        """Sets capacity according to capacity adjustment rule (no save)"""
        if self.enabled and self.node_type != 'hop':
            # capacity_adjustment interpolates between the conservative (min)
            # and optimistic (max) of the cpu- and memory-derived capacities.
            lower_cap = min(self.mem_capacity, self.cpu_capacity)
            higher_cap = max(self.mem_capacity, self.cpu_capacity)
            self.capacity = lower_cap + (higher_cap -
                                         lower_cap) * self.capacity_adjustment
        else:
            self.capacity = 0

    def refresh_capacity_fields(self):
        """Update derived capacity fields from cpu and memory (no save)"""
        if self.node_type == 'hop':
            self.cpu_capacity = 0
            self.mem_capacity = 0  # formula has a non-zero offset, so we make sure it is 0 for hop nodes
        else:
            self.cpu_capacity = get_cpu_effective_capacity(self.cpu)
            self.mem_capacity = get_mem_effective_capacity(self.memory)
        self.set_capacity_value()

    def save_health_data(self,
                         version=None,
                         cpu=0,
                         memory=0,
                         uuid=None,
                         update_last_seen=False,
                         errors=''):
        """Persist the results of a health check, saving only fields that changed.

        A falsy errors value means the check passed and capacity is recomputed;
        otherwise the node is marked offline with the error text recorded.
        """
        update_fields = ['errors']
        if self.node_type != 'hop':
            self.last_health_check = now()
            update_fields.append('last_health_check')

        if update_last_seen:
            # NOTE(review): for hop nodes last_health_check is not refreshed above,
            # so this reuses whatever value it already holds — confirm intended.
            self.last_seen = self.last_health_check
            update_fields.append('last_seen')

        if uuid is not None and self.uuid != uuid:
            if self.uuid is not None:
                logger.warning(
                    f'Self-reported uuid of {self.hostname} changed from {self.uuid} to {uuid}'
                )
            self.uuid = uuid
            update_fields.append('uuid')

        if version is not None and self.version != version:
            self.version = version
            update_fields.append('version')

        new_cpu = get_corrected_cpu(cpu)
        if new_cpu != self.cpu:
            self.cpu = new_cpu
            update_fields.append('cpu')

        new_memory = get_corrected_memory(memory)
        if new_memory != self.memory:
            self.memory = new_memory
            update_fields.append('memory')

        if not errors:
            self.refresh_capacity_fields()
            self.errors = ''
        else:
            self.mark_offline(perform_save=False, errors=errors)
        update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])

        # disabling activity stream will avoid extra queries, which is important for heartbeat actions
        from awx.main.signals import disable_activity_stream

        with disable_activity_stream():
            self.save(update_fields=update_fields)

    def local_health_check(self):
        """Only call this method on the instance that this record represents"""
        errors = None
        try:
            # if redis is down for some reason, that means we can't persist
            # playbook event data; we should consider this a zero capacity event
            redis.Redis.from_url(settings.BROKER_URL).ping()
        except redis.ConnectionError:
            # Fixed typo in the user-facing error message ("ot" -> "to").
            errors = _('Failed to connect to Redis')

        self.save_health_data(awx_application_version,
                              get_cpu_count(),
                              get_mem_in_bytes(),
                              update_last_seen=True,
                              errors=errors)
Example #4
0
class Instance(HasPolicyEditsMixin, BaseModel):
    """A model representing an AWX instance running against this database."""

    objects = InstanceManager()

    # Identifier self-reported by the instance process.
    uuid = models.CharField(max_length=40)
    # Cluster-wide unique hostname; join key against UnifiedJob.execution_node.
    hostname = models.CharField(max_length=250, unique=True)
    ip_address = models.CharField(
        blank=True,
        null=True,
        default=None,
        max_length=50,
        unique=True,
    )
    created = models.DateTimeField(auto_now_add=True)
    # Refreshed on every save; doubles as the heartbeat timestamp consumed by is_lost().
    modified = models.DateTimeField(auto_now=True)
    version = models.CharField(max_length=120, blank=True)
    # Scheduling capacity units; recomputed by refresh_capacity(), 0 when disabled.
    capacity = models.PositiveIntegerField(
        default=100,
        editable=False,
    )
    # User-tunable multiplier (0.00-9.99) applied when deriving capacity.
    capacity_adjustment = models.DecimalField(
        default=Decimal(1.0),
        max_digits=3,
        decimal_places=2,
        validators=[MinValueValidator(0)])
    enabled = models.BooleanField(default=True)
    managed_by_policy = models.BooleanField(default=True)
    # Raw resource measurements, refreshed by refresh_capacity().
    cpu = models.IntegerField(
        default=0,
        editable=False,
    )
    memory = models.BigIntegerField(
        default=0,
        editable=False,
    )
    # Capacity as derived from cpu / memory respectively.
    cpu_capacity = models.IntegerField(
        default=0,
        editable=False,
    )
    mem_capacity = models.IntegerField(
        default=0,
        editable=False,
    )
    NODE_TYPE_CHOICES = [("control", "Control plane node"),
                         ("execution", "Execution plane node"),
                         ("hybrid", "Controller and execution")]
    node_type = models.CharField(default='hybrid',
                                 choices=NODE_TYPE_CHOICES,
                                 max_length=16)

    class Meta:
        app_label = 'main'
        ordering = ("hostname", )

    # Fields whose edits are tracked by the HasPolicyEditsMixin machinery.
    POLICY_FIELDS = frozenset(
        ('managed_by_policy', 'hostname', 'capacity_adjustment'))

    def get_absolute_url(self, request=None):
        """Return the API detail URL for this instance record."""
        return reverse('api:instance_detail',
                       kwargs={'pk': self.pk},
                       request=request)

    @property
    def consumed_capacity(self):
        """Sum of task_impact over jobs currently running or waiting on this node."""
        return sum(x.task_impact for x in UnifiedJob.objects.filter(
            execution_node=self.hostname, status__in=('running', 'waiting')))

    @property
    def remaining_capacity(self):
        """Capacity still available for new work (may go negative under load)."""
        return self.capacity - self.consumed_capacity

    @property
    def role(self):
        # NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing
        return "awx"

    @property
    def jobs_running(self):
        """Count of jobs currently running or waiting on this node."""
        return UnifiedJob.objects.filter(
            execution_node=self.hostname,
            status__in=(
                'running',
                'waiting',
            ),
        ).count()

    @property
    def jobs_total(self):
        """Count of all jobs ever executed on this node."""
        return UnifiedJob.objects.filter(execution_node=self.hostname).count()

    def is_lost(self, ref_time=None):
        """Return True when this instance has not saved (heartbeat) in 120 seconds."""
        if ref_time is None:
            ref_time = now()
        grace_period = 120
        return self.modified < ref_time - timedelta(seconds=grace_period)

    def refresh_capacity(self):
        """Re-measure system resources and persist the derived capacity fields."""
        cpu = get_cpu_capacity()
        mem = get_mem_capacity()
        if self.enabled:
            self.capacity = get_system_task_capacity(self.capacity_adjustment)
        else:
            # Disabled nodes advertise zero capacity so the scheduler skips them.
            self.capacity = 0

        try:
            # if redis is down for some reason, that means we can't persist
            # playbook event data; we should consider this a zero capacity event
            redis.Redis.from_url(settings.BROKER_URL).ping()
        except redis.ConnectionError:
            self.capacity = 0

        # Each helper returns (raw measurement, derived capacity).
        self.cpu = cpu[0]
        self.memory = mem[0]
        self.cpu_capacity = cpu[1]
        self.mem_capacity = mem[1]
        self.version = awx_application_version
        self.save(update_fields=[
            'capacity', 'version', 'modified', 'cpu', 'memory', 'cpu_capacity',
            'mem_capacity'
        ])
Example #5
0
class Instance(HasPolicyEditsMixin, BaseModel):
    """A model representing an AWX instance running against this database."""

    objects = InstanceManager()

    # Fields set in instance registration
    uuid = models.CharField(max_length=40)
    hostname = models.CharField(max_length=250, unique=True)
    ip_address = models.CharField(
        blank=True,
        null=True,
        default=None,
        max_length=50,
        unique=True,
    )
    # Auto-fields, implementation is different from BaseModel
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)
    # Fields defined in health check or heartbeat
    version = models.CharField(max_length=120, blank=True)
    cpu = models.IntegerField(
        default=0,
        editable=False,
    )
    memory = models.BigIntegerField(
        default=0,
        editable=False,
        help_text=_('Total system memory of this instance in bytes.'),
    )
    last_seen = models.DateTimeField(
        null=True,
        editable=False,
        help_text=
        _('Last time instance ran its heartbeat task for main cluster nodes. Last known connection to receptor mesh for execution nodes.'
          ),
    )
    # Capacity management
    capacity = models.PositiveIntegerField(
        default=100,
        editable=False,
    )
    # User-tunable multiplier (0.00-9.99) applied in set_capacity_value().
    capacity_adjustment = models.DecimalField(
        default=Decimal(1.0),
        max_digits=3,
        decimal_places=2,
        validators=[MinValueValidator(0)])
    enabled = models.BooleanField(default=True)
    managed_by_policy = models.BooleanField(default=True)

    # Capacity as derived from cpu / memory respectively.
    cpu_capacity = models.IntegerField(
        default=0,
        editable=False,
    )
    mem_capacity = models.IntegerField(
        default=0,
        editable=False,
    )
    NODE_TYPE_CHOICES = [("control", "Control plane node"),
                         ("execution", "Execution plane node"),
                         ("hybrid", "Controller and execution")]
    node_type = models.CharField(default='hybrid',
                                 choices=NODE_TYPE_CHOICES,
                                 max_length=16)

    class Meta:
        app_label = 'main'
        ordering = ("hostname", )

    # Fields whose edits are tracked by the HasPolicyEditsMixin machinery.
    POLICY_FIELDS = frozenset(
        ('managed_by_policy', 'hostname', 'capacity_adjustment'))

    def get_absolute_url(self, request=None):
        """Return the API detail URL for this instance record."""
        return reverse('api:instance_detail',
                       kwargs={'pk': self.pk},
                       request=request)

    @property
    def consumed_capacity(self):
        """Sum of task_impact over jobs currently running or waiting on this node."""
        return sum(x.task_impact for x in UnifiedJob.objects.filter(
            execution_node=self.hostname, status__in=('running', 'waiting')))

    @property
    def remaining_capacity(self):
        """Capacity still available for new work (may go negative under load)."""
        return self.capacity - self.consumed_capacity

    @property
    def jobs_running(self):
        """Count of jobs currently running or waiting on this node."""
        return UnifiedJob.objects.filter(
            execution_node=self.hostname,
            status__in=(
                'running',
                'waiting',
            ),
        ).count()

    @property
    def jobs_total(self):
        """Count of all jobs ever executed on this node."""
        return UnifiedJob.objects.filter(execution_node=self.hostname).count()

    @staticmethod
    def choose_online_control_plane_node():
        """Pick a random enabled control-capable node's hostname.

        Raises IndexError if no enabled control or hybrid node exists.
        """
        return random.choice(
            Instance.objects.filter(enabled=True).filter(
                node_type__in=['control', 'hybrid']).values_list('hostname',
                                                                 flat=True))

    def is_lost(self, ref_time=None):
        """Return True when this instance has not been seen within the grace period.

        The grace period is two heartbeat intervals; execution nodes get extra
        slack for the receptor advertisement cadence.
        """
        if self.last_seen is None:
            return True
        if ref_time is None:
            ref_time = now()
        grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2
        if self.node_type == 'execution':
            grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD
        return self.last_seen < ref_time - timedelta(seconds=grace_period)

    def mark_offline(self, update_last_seen=False, perform_save=True):
        """Zero out all capacity fields (and optionally bump last_seen).

        No-op when capacity is already zeroed and last_seen is not being
        updated, which avoids pointless writes from repeated health checks.
        """
        if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and (
                not update_last_seen):
            return
        self.cpu_capacity = self.mem_capacity = self.capacity = 0
        if update_last_seen:
            self.last_seen = now()

        if perform_save:
            update_fields = ['capacity', 'cpu_capacity', 'mem_capacity']
            if update_last_seen:
                update_fields += ['last_seen']
            self.save(update_fields=update_fields)

    def set_capacity_value(self):
        """Sets capacity according to capacity adjustment rule (no save)"""
        if self.enabled:
            # capacity_adjustment interpolates between the conservative (min)
            # and optimistic (max) of the cpu- and memory-derived capacities.
            lower_cap = min(self.mem_capacity, self.cpu_capacity)
            higher_cap = max(self.mem_capacity, self.cpu_capacity)
            self.capacity = lower_cap + (higher_cap -
                                         lower_cap) * self.capacity_adjustment
        else:
            self.capacity = 0

    def refresh_capacity_fields(self):
        """Update derived capacity fields from cpu and memory (no save)"""
        self.cpu_capacity = get_cpu_effective_capacity(self.cpu)
        self.mem_capacity = get_mem_effective_capacity(self.memory)
        self.set_capacity_value()

    def save_health_data(self,
                         version,
                         cpu,
                         memory,
                         uuid=None,
                         last_seen=None,
                         has_error=False):
        """Persist the results of a health check, saving only fields that changed.

        On success capacity is recomputed from the corrected cpu/memory values;
        on error the node is marked offline (capacity fields zeroed).
        """
        update_fields = []

        if last_seen is not None and self.last_seen != last_seen:
            self.last_seen = last_seen
            update_fields.append('last_seen')

        if uuid is not None and self.uuid != uuid:
            if self.uuid is not None:
                # logger.warn is a deprecated alias; use logger.warning.
                logger.warning(
                    f'Self-reported uuid of {self.hostname} changed from {self.uuid} to {uuid}'
                )
            self.uuid = uuid
            update_fields.append('uuid')

        if self.version != version:
            self.version = version
            update_fields.append('version')

        new_cpu = get_corrected_cpu(cpu)
        if new_cpu != self.cpu:
            self.cpu = new_cpu
            update_fields.append('cpu')

        new_memory = get_corrected_memory(memory)
        if new_memory != self.memory:
            self.memory = new_memory
            update_fields.append('memory')

        if not has_error:
            self.refresh_capacity_fields()
        else:
            self.mark_offline(perform_save=False)
        update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])

        self.save(update_fields=update_fields)

    def local_health_check(self):
        """Only call this method on the instance that this record represents"""
        has_error = False
        try:
            # if redis is down for some reason, that means we can't persist
            # playbook event data; we should consider this a zero capacity event
            redis.Redis.from_url(settings.BROKER_URL).ping()
        except redis.ConnectionError:
            has_error = True

        self.save_health_data(awx_application_version,
                              get_cpu_count(),
                              get_mem_in_bytes(),
                              last_seen=now(),
                              has_error=has_error)