def register(self, uuid=None, hostname=None, ip_address=None):
    if not uuid:
        uuid = settings.SYSTEM_UUID
    if not hostname:
        hostname = settings.CLUSTER_HOST_ID
    with advisory_lock('instance_registration_%s' % hostname):
        if settings.AWX_AUTO_DEPROVISION_INSTANCES:
            # Detect any instances with the same IP address.
            # If one exists, set its IP address to None.
            inst_conflicting_ip = self.filter(ip_address=ip_address).exclude(hostname=hostname)
            if inst_conflicting_ip.exists():
                for other_inst in inst_conflicting_ip:
                    other_hostname = other_inst.hostname
                    other_inst.ip_address = None
                    other_inst.save(update_fields=['ip_address'])
                    logger.warning("IP address {0} conflict detected, ip address unset for host {1}.".format(ip_address, other_hostname))

        instance = self.filter(hostname=hostname)
        if instance.exists():
            instance = instance.get()
            if instance.ip_address != ip_address:
                instance.ip_address = ip_address
                instance.save(update_fields=['ip_address'])
                return (True, instance)
            else:
                return (False, instance)
        instance = self.create(uuid=uuid, hostname=hostname, ip_address=ip_address, capacity=0)
    return (True, instance)
def migrate_json_fields(table, expensive, columns):
    logger.warning(f"Migrating json fields: {table} {columns}")
    with advisory_lock(f'json_migration_{table}', wait=False) as acquired:
        if not acquired:
            return
        from django.db.migrations.executor import MigrationExecutor

        # If Django is currently running migrations, wait until it is done.
        while True:
            executor = MigrationExecutor(connection)
            if not executor.migration_plan(executor.loader.graph.leaf_nodes()):
                break
            time.sleep(60)

        if expensive:
            migrate_json_fields_expensive(table, columns)
        else:
            with connection.cursor() as cursor:
                # Multiple ALTER clauses in a single statement must be comma-separated.
                column_expr = ", ".join(f"ALTER {colname} TYPE jsonb" for colname in columns)
                cursor.execute(f"ALTER TABLE {table} {column_expr};")
        logger.warning(f"Migration of {table} to jsonb is finished")
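Every snippet in this section funnels through the same context manager. As a point of reference, here is a minimal sketch of its two acquisition modes, blocking (the default) and non-blocking with wait=False, where the context variable reports whether the lock was obtained. The import path is assumed to be AWX's awx.main.utils.pglock helper, and the lock name is hypothetical.

# A minimal sketch, assuming advisory_lock from awx.main.utils.pglock
# (the helper used throughout this section); 'example_lock' is made up.
from awx.main.utils.pglock import advisory_lock


def blocking_example():
    # Blocking mode (default): waits until the lock is free, then runs the body.
    with advisory_lock('example_lock'):
        do_work()  # critical section, runs once the lock is held


def non_blocking_example():
    # Non-blocking mode: `acquired` is False if another process holds the
    # lock, letting the caller skip the work instead of waiting for it.
    with advisory_lock('example_lock', wait=False) as acquired:
        if not acquired:
            return
        do_work()  # critical section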
def handle(self, *args, **options):
    # TODO: remove in 3.3
    if options.get('name'):
        warnings.warn("`--name` is deprecated in favor of `--hostname`, and will be removed in release 3.3.")
        if options.get('hostname'):
            raise CommandError("Cannot accept both --name and --hostname.")
        options['hostname'] = options['name']
    hostname = options.get('hostname')
    if not hostname:
        raise CommandError("--hostname is a required argument")
    with advisory_lock('instance_registration_%s' % hostname):
        instance = Instance.objects.filter(hostname=hostname)
        if instance.exists():
            instance.delete()
            print("Instance Removed")
            result = subprocess.Popen("rabbitmqctl forget_cluster_node rabbitmq@{}".format(hostname), shell=True).wait()
            if result != 0:
                print("Node deprovisioning may have failed when attempting to "
                      "remove the RabbitMQ instance {} from the cluster".format(hostname))
            else:
                print('Successfully deprovisioned {}'.format(hostname))
            print('(changed: True)')
        else:
            print('No instance found matching name {}'.format(hostname))
def handle(self, *args, **options):
    # TODO: remove in 3.3
    hostname = options.get('hostname')
    if not hostname:
        raise CommandError("--hostname is a required argument")
    with advisory_lock('instance_registration_%s' % hostname):
        instance = Instance.objects.filter(hostname=hostname)
        if instance.exists():
            isolated = instance.first().is_isolated()
            instance.delete()
            print("Instance Removed")
            if isolated:
                print('Successfully deprovisioned {}'.format(hostname))
            else:
                result = subprocess.Popen("rabbitmqctl forget_cluster_node rabbitmq@{}".format(hostname), shell=True).wait()
                if result != 0:
                    print("Node deprovisioning may have failed when attempting to "
                          "remove the RabbitMQ instance {} from the cluster".format(hostname))
                else:
                    print('Successfully deprovisioned {}'.format(hostname))
            print('(changed: True)')
        else:
            print('No instance found matching name {}'.format(hostname))
def register(self):
    with advisory_lock('cluster_policy_lock'):
        with transaction.atomic():
            changed2 = False
            changed3 = False
            (ig, created, changed1) = self.get_create_update_instance_group()
            if created:
                print("Creating instance group {}".format(ig.name))
            else:
                print("Instance Group already registered {}".format(ig.name))

            if self.controller:
                (ig_ctrl, changed2) = self.update_instance_group_controller(ig)
                if changed2:
                    print("Set controller group {} on {}.".format(self.controller, self.queuename))

            try:
                (instances, changed3) = self.add_instances_to_group(ig)
                for i in instances:
                    print("Added instance {} to {}".format(i.hostname, ig.name))
            except InstanceNotFound as e:
                self.instance_not_found_err = e

            if any([changed1, changed2, changed3]):
                print('(changed: True)')
def inspect_execution_nodes(instance_list):
    with advisory_lock('inspect_execution_nodes_lock', wait=False):
        node_lookup = {inst.hostname: inst for inst in instance_list}

        ctl = get_receptor_ctl()
        mesh_status = ctl.simple_command('status')

        nowtime = now()
        workers = mesh_status['Advertisements']
        for ad in workers:
            hostname = ad['NodeID']
            if hostname in node_lookup:
                instance = node_lookup[hostname]
            else:
                logger.warning(f"Unrecognized node advertising on mesh: {hostname}")
                continue

            # Control-plane nodes are dealt with via local_health_check instead.
            if instance.node_type in ('control', 'hybrid'):
                continue

            was_lost = instance.is_lost(ref_time=nowtime)
            last_seen = parse_date(ad['Time'])
            if instance.last_seen and instance.last_seen >= last_seen:
                continue
            instance.last_seen = last_seen
            instance.save(update_fields=['last_seen'])

            # Only execution nodes should be dealt with by execution_node_health_check.
            if instance.node_type == 'hop':
                if was_lost and (not instance.is_lost(ref_time=nowtime)):
                    logger.warning(f'Hop node {hostname} has rejoined the receptor mesh')
                    instance.save_health_data(errors='')
                continue

            if was_lost:
                # If the instance *was* lost but has appeared again, attempt to
                # re-establish the initial capacity and version check.
                logger.warning(f'Execution node attempting to rejoin as instance {hostname}.')
                execution_node_health_check.apply_async([hostname])
            elif instance.capacity == 0 and instance.enabled:
                # Nodes with a proven connection that still need remediation
                # run health checks at a reduced frequency.
                if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS:
                    # Periodically re-run the health check of errored nodes, in case someone fixed it.
                    # TODO: perhaps decrease the frequency of these checks
                    logger.debug(f'Restarting health check for execution node {hostname} with known errors.')
                    execution_node_health_check.apply_async([hostname])
def awx_periodic_scheduler():
    with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired:
        if acquired is False:
            logger.debug("Not running periodic scheduler, another task holds lock")
            return
        logger.debug("Starting periodic scheduler")

        run_now = now()
        state = TowerScheduleState.get_solo()
        last_run = state.schedule_last_run
        logger.debug("Last scheduler run was: %s", last_run)
        state.schedule_last_run = run_now
        state.save()

        old_schedules = Schedule.objects.enabled().before(last_run)
        for schedule in old_schedules:
            schedule.update_computed_fields()
        schedules = Schedule.objects.enabled().between(last_run, run_now)

        invalid_license = False
        try:
            access_registry[Job](None).check_license(quiet=True)
        except PermissionDenied as e:
            invalid_license = e

        for schedule in schedules:
            template = schedule.unified_job_template
            schedule.update_computed_fields()  # To update next_run timestamp.
            if template.cache_timeout_blocked:
                logger.warning("Cache timeout is in the future, bypassing schedule for template %s" % str(template.id))
                continue
            try:
                job_kwargs = schedule.get_job_kwargs()
                new_unified_job = schedule.unified_job_template.create_unified_job(**job_kwargs)
                logger.debug('Spawned {} from schedule {}-{}.'.format(new_unified_job.log_format, schedule.name, schedule.pk))

                if invalid_license:
                    new_unified_job.status = 'failed'
                    new_unified_job.job_explanation = str(invalid_license)
                    new_unified_job.save(update_fields=['status', 'job_explanation'])
                    new_unified_job.websocket_emit_status("failed")
                    raise invalid_license
                can_start = new_unified_job.signal_start()
            except Exception:
                logger.exception('Error spawning scheduled job.')
                continue
            if not can_start:
                new_unified_job.status = 'failed'
                new_unified_job.job_explanation = gettext_noop(
                    "Scheduled job could not start because it was not in the right state or required manual credentials"
                )
                new_unified_job.save(update_fields=['status', 'job_explanation'])
                new_unified_job.websocket_emit_status("failed")
            emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules"))
        state.save()
def schedule(self):
    # Lock
    with advisory_lock('task_manager_lock', wait=False) as acquired:
        with transaction.atomic():
            if acquired is False:
                logger.debug("Not running scheduler, another task holds lock")
                return
            logger.debug("Starting Scheduler")
            with task_manager_bulk_reschedule():
                self._schedule()
def register(self, uuid=None, hostname=None, ip_address=None, node_type='hybrid', defaults=None):
    if not hostname:
        hostname = settings.CLUSTER_HOST_ID
    with advisory_lock('instance_registration_%s' % hostname):
        if settings.AWX_AUTO_DEPROVISION_INSTANCES:
            # Detect any instances with the same IP address.
            # If one exists, set its IP address to None.
            inst_conflicting_ip = self.filter(ip_address=ip_address).exclude(hostname=hostname)
            if inst_conflicting_ip.exists():
                for other_inst in inst_conflicting_ip:
                    other_hostname = other_inst.hostname
                    other_inst.ip_address = None
                    other_inst.save(update_fields=['ip_address'])
                    logger.warning("IP address {0} conflict detected, ip address unset for host {1}.".format(ip_address, other_hostname))

        # Return existing instance that matches hostname or UUID (default to UUID).
        if uuid is not None and uuid != UUID_DEFAULT and self.filter(uuid=uuid).exists():
            instance = self.filter(uuid=uuid)
        else:
            # If the instance was not retrieved by uuid, fall back to the hostname.
            instance = self.filter(hostname=hostname)

        # Return existing instance.
        if instance.exists():
            instance = instance.first()  # in the unusual occasion that there is more than one, only get one
            update_fields = []
            # If the instance was retrieved by uuid and the hostname has changed, update the hostname.
            if instance.hostname != hostname:
                logger.warning("passed in hostname {0} is different from the original hostname {1}, updating to {0}".format(hostname, instance.hostname))
                instance.hostname = hostname
                update_fields.append('hostname')
            # Update any other fields that have changed.
            if instance.ip_address != ip_address:
                instance.ip_address = ip_address
                update_fields.append('ip_address')
            if instance.node_type != node_type:
                instance.node_type = node_type
                update_fields.append('node_type')
            if update_fields:
                instance.save(update_fields=update_fields)
                return (True, instance)
            else:
                return (False, instance)

        # Create a new instance, filling in default values.
        create_defaults = dict(capacity=0)
        if defaults is not None:
            create_defaults.update(defaults)
        uuid_option = {}
        if uuid is not None:
            uuid_option = dict(uuid=uuid)
        if node_type == 'execution' and 'version' not in create_defaults:
            create_defaults['version'] = RECEPTOR_PENDING
        instance = self.create(hostname=hostname, ip_address=ip_address, node_type=node_type, **create_defaults, **uuid_option)
    return (True, instance)
def register(self, uuid=None, hostname=None, ip_address=None, node_type='hybrid', defaults=None):
    if not hostname:
        hostname = settings.CLUSTER_HOST_ID
    with advisory_lock('instance_registration_%s' % hostname):
        if settings.AWX_AUTO_DEPROVISION_INSTANCES:
            # Detect any instances with the same IP address.
            # If one exists, set its IP address to None.
            inst_conflicting_ip = self.filter(ip_address=ip_address).exclude(hostname=hostname)
            if inst_conflicting_ip.exists():
                for other_inst in inst_conflicting_ip:
                    other_hostname = other_inst.hostname
                    other_inst.ip_address = None
                    other_inst.save(update_fields=['ip_address'])
                    logger.warning("IP address {0} conflict detected, ip address unset for host {1}.".format(ip_address, other_hostname))

        # Return existing instance that matches hostname.
        instance = self.filter(hostname=hostname)
        if instance.exists():
            instance = instance.get()
            update_fields = []
            if instance.ip_address != ip_address:
                instance.ip_address = ip_address
                update_fields.append('ip_address')
            if instance.node_type != node_type:
                instance.node_type = node_type
                update_fields.append('node_type')
            if update_fields:
                instance.save(update_fields=update_fields)
                return (True, instance)
            else:
                return (False, instance)

        # Create a new instance, filling in default values.
        create_defaults = dict(capacity=0)
        if defaults is not None:
            create_defaults.update(defaults)
        uuid_option = {}
        if uuid is not None:
            uuid_option = dict(uuid=uuid)
        if node_type == 'execution' and 'version' not in create_defaults:
            create_defaults['version'] = RECEPTOR_PENDING
        instance = self.create(hostname=hostname, ip_address=ip_address, node_type=node_type, **create_defaults, **uuid_option)
    return (True, instance)
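The register() variants above all return a (changed, instance) tuple. The following is a hedged caller sketch, with a made-up hostname, of how the registration commands in this section consume that convention for their idempotent "(changed: True)" reporting.

# Hypothetical caller sketch: register() returns (changed, instance),
# so callers can distinguish a fresh registration or update from a no-op.
# The hostname below is illustrative only.
(changed, instance) = Instance.objects.register(hostname='awx-exec-1.example.org', node_type='execution')
if changed:
    print('Successfully registered instance {}'.format(instance.hostname))
    print('(changed: True)')
else:
    print('Instance already registered {}'.format(instance.hostname))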
def handle(self, **options):
    queuename = options.get('queuename')
    if not queuename:
        raise CommandError("Specify `--queuename` to use this command.")
    changed = False
    with advisory_lock('instance_group_registration_%s' % queuename):
        ig = InstanceGroup.objects.filter(name=queuename)
        control_ig = None
        if options.get('controller'):
            control_ig = InstanceGroup.objects.filter(name=options.get('controller')).first()
        if ig.exists():
            print("Instance Group already registered {}".format(ig[0].name))
            ig = ig[0]
            if control_ig and ig.controller_id != control_ig.pk:
                ig.controller = control_ig
                ig.save()
                print("Set controller group {} on {}.".format(control_ig.name, ig.name))
                changed = True
        else:
            print("Creating instance group {}".format(queuename))
            ig = InstanceGroup(
                name=queuename,
                policy_instance_percentage=options.get('instance_percent'),
                policy_instance_minimum=options.get('instance_minimum'))
            if control_ig:
                ig.controller = control_ig
            ig.save()
            changed = True
        hostname_list = []
        if options.get('hostnames'):
            hostname_list = options.get('hostnames').split(",")
        instance_list = [x.strip() for x in hostname_list if x]
        for inst_name in instance_list:
            instance = Instance.objects.filter(hostname=inst_name)
            if instance.exists() and instance[0] not in ig.instances.all():
                ig.instances.add(instance[0])
                print("Added instance {} to {}".format(instance[0].hostname, ig.name))
                changed = True
            elif not instance.exists():
                print("Instance does not exist: {}".format(inst_name))
                if changed:
                    print('(changed: True)')
                sys.exit(1)
            else:
                print("Instance already registered {}".format(instance[0].hostname))
        ig.policy_instance_list = instance_list
        ig.save()
        if changed:
            print('(changed: True)')
def register(self, uuid=None, hostname=None):
    if not uuid:
        uuid = settings.SYSTEM_UUID
    if not hostname:
        hostname = settings.CLUSTER_HOST_ID
    with advisory_lock('instance_registration_%s' % hostname):
        instance = self.filter(hostname=hostname)
        if instance.exists():
            return (False, instance[0])
        instance = self.create(uuid=uuid, hostname=hostname, capacity=0)
    return (True, instance)
def handle(self, **options):
    instance_not_found_err = None
    queuename = options.get('queuename')
    if not queuename:
        raise CommandError("Specify `--queuename` to use this command.")
    ctrl = options.get('controller')
    inst_per = options.get('instance_percent')
    inst_min = options.get('instance_minimum')
    hostname_list = []
    if options.get('hostnames'):
        hostname_list = options.get('hostnames').split(",")

    with advisory_lock(six.text_type('instance_group_registration_{}').format(queuename)):
        changed2 = False
        changed3 = False
        (ig, created, changed1) = self.get_create_update_instance_group(queuename, inst_per, inst_min)
        if created:
            print(six.text_type("Creating instance group {}").format(ig.name))
        else:
            print(six.text_type("Instance Group already registered {}").format(ig.name))

        if ctrl:
            (ig_ctrl, changed2) = self.update_instance_group_controller(ig, ctrl)
            if changed2:
                print(six.text_type("Set controller group {} on {}.").format(ctrl, queuename))

        try:
            (instances, changed3) = self.add_instances_to_group(ig, hostname_list)
            for i in instances:
                print(six.text_type("Added instance {} to {}").format(i.hostname, ig.name))
        except InstanceNotFound as e:
            instance_not_found_err = e

        if any([changed1, changed2, changed3]):
            print('(changed: True)')

    if instance_not_found_err:
        print(instance_not_found_err.message)
        sys.exit(1)
def _register_hostname(self, hostname):
    if not hostname:
        return
    with advisory_lock('instance_registration_%s' % hostname):
        instance = Instance.objects.filter(hostname=hostname)
        if instance.exists():
            print("Instance already registered {}".format(instance[0].hostname))
            return
        instance = Instance(uuid=self.uuid, hostname=hostname)
        instance.save()
    print('Successfully registered instance {}'.format(hostname))
    self.changed = True
def handle(self, **options):
    queuename = options.get('queuename')
    if not queuename:
        raise CommandError('Must specify `--queuename` in order to use command.')
    with advisory_lock('instance_group_registration_%s' % queuename):
        ig = InstanceGroup.objects.filter(name=queuename)
        if not ig.exists():
            print("Instance group doesn't exist")
            sys.exit(1)
        ig = ig.first()
        ig.delete()
        print("Instance Group Removed")
        print('(changed: True)')
def schedule(self):
    # Lock
    with advisory_lock('task_manager_lock', wait=False) as acquired:
        with transaction.atomic():
            if acquired is False:
                logger.debug("Not running scheduler, another task holds lock")
                return
            logger.debug("Starting Scheduler")
            finished_wfjs = self._schedule()

        # Operations whose queries rely on modifications made during the atomic scheduling session.
        for wfj in WorkflowJob.objects.filter(id__in=finished_wfjs):
            wfj.send_notification_templates('succeeded' if wfj.status == 'successful' else 'failed')
def handle(self, *args, **options):
    # TODO: remove in 3.3
    hostname = options.get('hostname')
    if not hostname:
        raise CommandError("--hostname is a required argument")
    with advisory_lock('instance_registration_%s' % hostname):
        instance = Instance.objects.filter(hostname=hostname)
        if instance.exists():
            instance.delete()
            print("Instance Removed")
            print('Successfully deprovisioned {}'.format(hostname))
            print('(changed: True)')
        else:
            print('No instance found matching name {}'.format(hostname))
def schedule(self):
    # Lock
    with advisory_lock('task_manager_lock', wait=False) as acquired:
        with transaction.atomic():
            if acquired is False:
                logger.debug("Not running scheduler, another task holds lock")
                return
            logger.debug("Starting Scheduler")
            with task_manager_bulk_reschedule():
                # If we get SIGTERM due to a timeout, still record metrics.
                signal.signal(signal.SIGTERM, self.record_aggregate_metrics_and_exit)
                self._schedule()
            self.record_aggregate_metrics()
            logger.debug("Finishing Scheduler")
def register(self):
    with advisory_lock('cluster_policy_lock'):
        with transaction.atomic():
            changed2 = False
            (ig, created, changed1) = self.get_create_update_instance_group()
            if created:
                print("Creating instance group {}".format(ig.name))
            else:
                print("Instance Group already registered {}".format(ig.name))

            try:
                (instances, changed2) = self.add_instances_to_group(ig)
                for i in instances:
                    print("Added instance {} to {}".format(i.hostname, ig.name))
            except InstanceNotFound as e:
                self.instance_not_found_err = e

            if changed1 or changed2:
                print('(changed: True)')
def register(self, uuid=None, hostname=None, ip_address=None):
    if not uuid:
        uuid = settings.SYSTEM_UUID
    if not hostname:
        hostname = settings.CLUSTER_HOST_ID
    with advisory_lock('instance_registration_%s' % hostname):
        instance = self.filter(hostname=hostname)
        if instance.exists():
            instance = instance.get()
            if instance.ip_address != ip_address:
                instance.ip_address = ip_address
                instance.save(update_fields=['ip_address'])
                return (True, instance)
            else:
                return (False, instance)
        instance = self.create(uuid=uuid, hostname=hostname, ip_address=ip_address, capacity=0)
    return (True, instance)
def schedule(self):
    # Lock
    with task_manager_bulk_reschedule():
        with advisory_lock(f"{self.prefix}_lock", wait=False) as acquired:
            with transaction.atomic():
                if acquired is False:
                    logger.debug(f"Not running {self.prefix} scheduler, another task holds lock")
                    return
                logger.debug(f"Starting {self.prefix} Scheduler")
                # If we get SIGTERM due to a timeout, still record metrics.
                signal.signal(signal.SIGTERM, self.record_aggregate_metrics_and_exit)
                self._schedule()
                commit_start = time.time()
            # The atomic block has exited here, so the elapsed time since
            # commit_start measures the transaction commit itself.
            if self.prefix == "task_manager":
                self.subsystem_metrics.set(f"{self.prefix}_commit_seconds", time.time() - commit_start)
        self.record_aggregate_metrics()
        logger.debug(f"Finishing {self.prefix} Scheduler")
def apply_cluster_membership_policies():
    from awx.main.signals import disable_activity_stream

    started_waiting = time.time()
    with advisory_lock('cluster_policy_lock', wait=True):
        lock_time = time.time() - started_waiting
        if lock_time > 1.0:
            to_log = logger.info
        else:
            to_log = logger.debug
        to_log('Waited {} seconds to obtain lock name: cluster_policy_lock'.format(lock_time))

        started_compute = time.time()
        # Hop nodes should never get assigned to an InstanceGroup.
        all_instances = list(Instance.objects.exclude(node_type='hop').order_by('id'))
        all_groups = list(InstanceGroup.objects.prefetch_related('instances'))

        total_instances = len(all_instances)
        actual_groups = []
        actual_instances = []
        Group = namedtuple('Group', ['obj', 'instances', 'prior_instances'])
        Node = namedtuple('Instance', ['obj', 'groups'])

        # Process the policy instance list first; these entries represent manually managed memberships.
        instance_hostnames_map = {inst.hostname: inst for inst in all_instances}
        for ig in all_groups:
            group_actual = Group(obj=ig, instances=[], prior_instances=[instance.pk for instance in ig.instances.all()])  # obtained in prefetch
            for hostname in ig.policy_instance_list:
                if hostname not in instance_hostnames_map:
                    logger.info("Unknown instance {} in {} policy list".format(hostname, ig.name))
                    continue
                inst = instance_hostnames_map[hostname]
                group_actual.instances.append(inst.id)
                # NOTE: arguable behavior: policy-list-group is not added to
                # instance's group count for consideration in minimum-policy rules
            if group_actual.instances:
                logger.debug("Policy List, adding Instances {} to Group {}".format(group_actual.instances, ig.name))
            actual_groups.append(group_actual)

        # Process Instance minimum policies next, since they represent a concrete lower
        # bound on the number of instances to make available to instance groups.
        actual_instances = [Node(obj=i, groups=[]) for i in all_instances if i.managed_by_policy]
        logger.debug("Total instances: {}, available for policy: {}".format(total_instances, len(actual_instances)))
        for g in sorted(actual_groups, key=lambda x: len(x.instances)):
            exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control'
            policy_min_added = []
            for i in sorted(actual_instances, key=lambda x: len(x.groups)):
                if i.obj.node_type == exclude_type:
                    continue  # never place execution instances in the control-plane group, or control instances in other groups
                if len(g.instances) >= g.obj.policy_instance_minimum:
                    break
                if i.obj.id in g.instances:
                    # If the instance is already _in_ the group, it was
                    # applied earlier via the policy list.
                    continue
                g.instances.append(i.obj.id)
                i.groups.append(g.obj.id)
                policy_min_added.append(i.obj.id)
            if policy_min_added:
                logger.debug("Policy minimum, adding Instances {} to Group {}".format(policy_min_added, g.obj.name))

        # Finally, process instance policy percentages.
        for g in sorted(actual_groups, key=lambda x: len(x.instances)):
            exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control'
            candidate_pool_ct = sum(1 for i in actual_instances if i.obj.node_type != exclude_type)
            if not candidate_pool_ct:
                continue
            policy_per_added = []
            for i in sorted(actual_instances, key=lambda x: len(x.groups)):
                if i.obj.node_type == exclude_type:
                    continue
                if i.obj.id in g.instances:
                    # If the instance is already _in_ the group, it was
                    # applied earlier via a minimum policy or the policy list.
                    continue
                if 100 * float(len(g.instances)) / candidate_pool_ct >= g.obj.policy_instance_percentage:
                    break
                g.instances.append(i.obj.id)
                i.groups.append(g.obj.id)
                policy_per_added.append(i.obj.id)
            if policy_per_added:
                logger.debug("Policy percentage, adding Instances {} to Group {}".format(policy_per_added, g.obj.name))

        # Determine if any changes need to be made.
        needs_change = False
        for g in actual_groups:
            if set(g.instances) != set(g.prior_instances):
                needs_change = True
                break
        if not needs_change:
            logger.debug('Cluster policy no-op finished in {} seconds'.format(time.time() - started_compute))
            return

        # On a differential basis, apply instances to groups.
        with transaction.atomic():
            with disable_activity_stream():
                for g in actual_groups:
                    if g.obj.is_container_group:
                        logger.debug('Skipping containerized group {} for policy calculation'.format(g.obj.name))
                        continue
                    instances_to_add = set(g.instances) - set(g.prior_instances)
                    instances_to_remove = set(g.prior_instances) - set(g.instances)
                    if instances_to_add:
                        logger.debug('Adding instances {} to group {}'.format(list(instances_to_add), g.obj.name))
                        g.obj.instances.add(*instances_to_add)
                    if instances_to_remove:
                        logger.debug('Removing instances {} from group {}'.format(list(instances_to_remove), g.obj.name))
                        g.obj.instances.remove(*instances_to_remove)
        logger.debug('Cluster policy computation finished in {} seconds'.format(time.time() - started_compute))
def perform_update(self, options, data, inventory_update):
    """Shared method for both awx-manage CLI updates and inventory updates from the tasks system.

    This saves the inventory data to the database, calling load_into_database,
    but also wraps that method in a host of options processing.
    """
    # Outside of the normal options, these are needed as part of the programmatic interface.
    self.inventory = inventory_update.inventory
    self.inventory_source = inventory_update.inventory_source
    self.inventory_update = inventory_update

    # The update options; could be a parser object or a dict.
    self.overwrite = bool(options.get('overwrite', False))
    self.overwrite_vars = bool(options.get('overwrite_vars', False))
    self.enabled_var = options.get('enabled_var', None)
    self.enabled_value = options.get('enabled_value', None)
    self.group_filter = options.get('group_filter', None) or r'^.+$'
    self.host_filter = options.get('host_filter', None) or r'^.+$'
    self.exclude_empty_groups = bool(options.get('exclude_empty_groups', False))
    self.instance_id_var = options.get('instance_id_var', None)

    try:
        self.group_filter_re = re.compile(self.group_filter)
    except re.error:
        raise CommandError('invalid regular expression for --group-filter')
    try:
        self.host_filter_re = re.compile(self.host_filter)
    except re.error:
        raise CommandError('invalid regular expression for --host-filter')

    begin = time.time()

    # Since perform_update can be invoked either through the awx-manage CLI
    # or from the task system, we need to create a new lock at this level
    # (even though inventory_import.Command.handle -- which calls
    # perform_update -- has its own lock, inventory_ID_import)
    with advisory_lock('inventory_{}_perform_update'.format(self.inventory.id)):
        try:
            self.check_license()
        except PermissionDenied as e:
            self.mark_license_failure(save=True)
            raise e

        try:
            # Check the per-org host limits.
            self.check_org_host_limit()
        except PermissionDenied as e:
            self.mark_org_limits_failure(save=True)
            raise e

        if settings.SQL_DEBUG:
            queries_before = len(connection.queries)

        # Update the inventory update for this command line invocation.
        with ignore_inventory_computed_fields():
            # TODO: move this to before perform_update
            iu = self.inventory_update
            if iu.status != 'running':
                with transaction.atomic():
                    self.inventory_update.status = 'running'
                    self.inventory_update.save()

        logger.info('Processing JSON output...')
        inventory = MemInventory(group_filter_re=self.group_filter_re, host_filter_re=self.host_filter_re)
        inventory = dict_to_mem_data(data, inventory=inventory)

        logger.info('Loaded %d groups, %d hosts', len(inventory.all_group.all_groups), len(inventory.all_group.all_hosts))

        if self.exclude_empty_groups:
            inventory.delete_empty_groups()

        self.all_group = inventory.all_group

        if settings.DEBUG:
            # Depending on the inventory source, this output can be
            # *exceedingly* verbose - crawling a deeply nested
            # inventory/group data structure and printing metadata about
            # each host and its memberships.
            #
            # It's easy for this scale of data to overwhelm pexpect
            # (and it's likely only useful for purposes of debugging the
            # actual inventory import code), so only print it if we have to:
            # https://github.com/ansible/ansible-tower/issues/7414#issuecomment-321615104
            self.all_group.debug_tree()

        with batch_role_ancestor_rebuilding():
            # When combining transaction.atomic() with try/except, the atomic
            # block must be inside the try section, as per the Django docs.
            try:
                # Ensure that this is managed as an atomic SQL transaction,
                # and thus properly rolled back if there is an issue.
                with transaction.atomic():
                    # Merge/overwrite inventory into the database.
                    if settings.SQL_DEBUG:
                        logger.warning('loading into database...')
                    with ignore_inventory_computed_fields():
                        if getattr(settings, 'ACTIVITY_STREAM_ENABLED_FOR_INVENTORY_SYNC', True):
                            self.load_into_database()
                        else:
                            with disable_activity_stream():
                                self.load_into_database()
                    if settings.SQL_DEBUG:
                        queries_before2 = len(connection.queries)
                    self.inventory.update_computed_fields()
                    if settings.SQL_DEBUG:
                        logger.warning('update computed fields took %d queries', len(connection.queries) - queries_before2)

                    # Check if the license is valid.
                    # If the license is not valid, a CommandError will be thrown,
                    # the inventory update will be marked as invalid,
                    # and transaction.atomic() will roll back the changes.
                    license_fail = True
                    self.check_license()

                    # Check the per-org host limits.
                    license_fail = False
                    self.check_org_host_limit()
            except PermissionDenied as e:
                if license_fail:
                    self.mark_license_failure(save=True)
                else:
                    self.mark_org_limits_failure(save=True)
                raise e

            if settings.SQL_DEBUG:
                logger.warning('Inventory import completed for %s in %0.1fs', self.inventory_source.name, time.time() - begin)
            else:
                logger.info('Inventory import completed for %s in %0.1fs', self.inventory_source.name, time.time() - begin)

        # If we're in debug mode, then log the queries and the time
        # used to do the operation.
        if settings.SQL_DEBUG:
            queries_this_import = connection.queries[queries_before:]
            sqltime = sum(float(x['time']) for x in queries_this_import)
            logger.warning('Inventory import required %d queries taking %0.3fs', len(queries_this_import), sqltime)
def gather(dest=None, module=None, subset=None, since=None, until=None, collection_type='scheduled'):
    """
    Gather all defined metrics and write them as JSON files in a .tgz

    :param dest:   the (optional) absolute path to write a compressed tarball
    :param module: the module to search for registered analytic collector
                   functions; defaults to awx.main.analytics.collectors
    """
    log_level = logging.ERROR if collection_type != 'scheduled' else logging.DEBUG

    if not _valid_license():
        logger.log(log_level, "Invalid License provided, or No License Provided")
        return None

    if collection_type != 'dry-run':
        if not settings.INSIGHTS_TRACKING_STATE:
            logger.log(log_level, "Automation Analytics not enabled. Use --dry-run to gather locally without sending.")
            return None

        if not (settings.AUTOMATION_ANALYTICS_URL and settings.REDHAT_USERNAME and settings.REDHAT_PASSWORD):
            logger.log(log_level, "Not gathering analytics, configuration is invalid. Use --dry-run to gather locally without sending.")
            return None

    with advisory_lock('gather_analytics_lock', wait=False) as acquired:
        if not acquired:
            logger.log(log_level, "Not gathering analytics, another task holds lock")
            return None

        from awx.conf.models import Setting
        from awx.main.analytics import collectors
        from awx.main.signals import disable_activity_stream

        logger.debug("Last analytics run was: {}".format(settings.AUTOMATION_ANALYTICS_LAST_GATHER))

        try:
            since, until, last_gather = calculate_collection_interval(since, until)
        except ValueError:
            return None

        last_entries = Setting.objects.filter(key='AUTOMATION_ANALYTICS_LAST_ENTRIES').first()
        last_entries = json.loads((last_entries.value if last_entries is not None else '') or '{}', object_hook=datetime_hook)

        collector_module = module if module else collectors
        collector_list = [
            func
            for name, func in inspect.getmembers(collector_module)
            if inspect.isfunction(func) and hasattr(func, '__awx_analytics_key__') and (not subset or name in subset)
        ]
        if not any(c.__awx_analytics_key__ == 'config' for c in collector_list):
            # In order to ship to analytics, we must include the output of the built-in 'config' collector.
            collector_list.append(collectors.config)

        json_collectors = [func for func in collector_list if func.__awx_analytics_type__ == 'json']
        csv_collectors = [func for func in collector_list if func.__awx_analytics_type__ == 'csv']

        dest = pathlib.Path(dest or tempfile.mkdtemp(prefix='awx_analytics'))
        gather_dir = dest.joinpath('stage')
        gather_dir.mkdir(mode=0o700)
        tarfiles = []
        succeeded = True

        # These json collectors are pretty compact, so collect all of them before shipping to analytics.
        data = {}
        for func in json_collectors:
            key = func.__awx_analytics_key__
            filename = f'{key}.json'
            try:
                last_entry = max(last_entries.get(key) or last_gather, until - timedelta(weeks=4))
                results = (func(since or last_entry, collection_type=collection_type, until=until), func.__awx_analytics_version__)
                json.dumps(results, cls=DjangoJSONEncoder)  # throwaway check to see if the data is json-serializable
                data[filename] = results
            except Exception:
                logger.exception("Could not generate metric {}".format(filename))
        if data:
            if data.get('config.json') is None:
                logger.error("'config' collector data is missing.")
                return None

            tgzfile = package(dest.parent, data, until)
            if tgzfile is not None:
                tarfiles.append(tgzfile)
                if collection_type != 'dry-run':
                    if ship(tgzfile):
                        with disable_activity_stream():
                            for filename in data:
                                key = filename.replace('.json', '')
                                last_entries[key] = max(last_entries[key], until) if last_entries.get(key) else until
                            settings.AUTOMATION_ANALYTICS_LAST_ENTRIES = json.dumps(last_entries, cls=DjangoJSONEncoder)
                    else:
                        succeeded = False

        for func in csv_collectors:
            key = func.__awx_analytics_key__
            filename = f'{key}.csv'
            try:
                # These slicer functions may return a generator. The `since` parameter is
                # allowed to be None, and will fall back to LAST_ENTRIES[key] or to
                # LAST_GATHER (truncated appropriately to match the 4-week limit).
                if func.__awx_expensive__:
                    slices = func.__awx_expensive__(key, since, until, last_gather)
                else:
                    slices = collectors.trivial_slicing(key, since, until, last_gather)

                for start, end in slices:
                    files = func(start, full_path=gather_dir, until=end)

                    if not files:
                        if collection_type != 'dry-run':
                            with disable_activity_stream():
                                entry = last_entries.get(key)
                                last_entries[key] = max(entry, end) if entry and type(entry) == type(end) else end
                                settings.AUTOMATION_ANALYTICS_LAST_ENTRIES = json.dumps(last_entries, cls=DjangoJSONEncoder)
                        continue

                    slice_succeeded = True
                    for fpath in files:
                        payload = {filename: (fpath, func.__awx_analytics_version__)}

                        payload['config.json'] = data.get('config.json')
                        if payload['config.json'] is None:
                            logger.error("'config' collector data is missing, and is required to ship.")
                            return None

                        tgzfile = package(dest.parent, payload, until)
                        if tgzfile is not None:
                            tarfiles.append(tgzfile)
                            if collection_type != 'dry-run':
                                if not ship(tgzfile):
                                    slice_succeeded, succeeded = False, False
                                    break

                    if slice_succeeded and collection_type != 'dry-run':
                        with disable_activity_stream():
                            entry = last_entries.get(key)
                            last_entries[key] = max(entry, end) if entry and type(entry) == type(end) else end
                            settings.AUTOMATION_ANALYTICS_LAST_ENTRIES = json.dumps(last_entries, cls=DjangoJSONEncoder)
            except Exception:
                succeeded = False
                logger.exception("Could not generate metric {}".format(filename))

        if collection_type != 'dry-run':
            if succeeded:
                for fpath in tarfiles:
                    if os.path.exists(fpath):
                        os.remove(fpath)
            with disable_activity_stream():
                if not settings.AUTOMATION_ANALYTICS_LAST_GATHER or until > settings.AUTOMATION_ANALYTICS_LAST_GATHER:
                    # `AUTOMATION_ANALYTICS_LAST_GATHER` is set whether collection succeeds or fails;
                    # if collection fails because of a persistent, underlying issue and we do not set last_gather,
                    # we risk the collectors hitting an increasingly greater workload while the underlying issue
                    # remains unresolved. Put simply, if collection fails, we just move on.

                    # All that said, `AUTOMATION_ANALYTICS_LAST_GATHER` plays a much smaller role in determining
                    # what is actually collected than it used to; collectors now mostly rely on their respective entry
                    # under `last_entries` to determine what should be collected.
                    settings.AUTOMATION_ANALYTICS_LAST_GATHER = until

        shutil.rmtree(dest, ignore_errors=True)  # clean up individual artifact files

        if not tarfiles:
            # No data was collected.
            logger.warning("No data from {} to {}".format(since or last_gather, until))
            return None

        return tarfiles
def handle(self, *args, **options):
    # Load the inventory and related objects from the database.
    inventory_name = options.get('inventory_name', None)
    inventory_id = options.get('inventory_id', None)
    if inventory_name and inventory_id:
        raise CommandError('--inventory-name and --inventory-id are mutually exclusive')
    elif not inventory_name and not inventory_id:
        raise CommandError('--inventory-name or --inventory-id is required')

    with advisory_lock('inventory_{}_import'.format(inventory_id)):
        # Obtain the rest of the options needed to run the update.
        raw_source = options.get('source', None)
        if not raw_source:
            raise CommandError('--source is required')
        verbosity = int(options.get('verbosity', 1))
        self.set_logging_level(verbosity)

        # Load the inventory object based on its name or ID.
        if inventory_id:
            q = dict(id=inventory_id)
        else:
            q = dict(name=inventory_name)
        try:
            inventory = Inventory.objects.get(**q)
        except Inventory.DoesNotExist:
            raise CommandError('Inventory with %s = %s cannot be found' % list(q.items())[0])
        except Inventory.MultipleObjectsReturned:
            raise CommandError('Inventory with %s = %s returned multiple results' % list(q.items())[0])
        logger.info('Updating inventory %d: %s' % (inventory.pk, inventory.name))

        # Create ad-hoc inventory source and inventory update objects.
        with ignore_inventory_computed_fields():
            source = Command.get_source_absolute_path(raw_source)

            inventory_source, created = InventorySource.objects.get_or_create(
                inventory=inventory,
                source='file',
                source_path=os.path.abspath(source),
                overwrite=bool(options.get('overwrite', False)),
                overwrite_vars=bool(options.get('overwrite_vars', False)),
            )
            inventory_update = inventory_source.create_inventory_update(
                _eager_fields=dict(job_args=json.dumps(sys.argv), job_env=dict(os.environ.items()), job_cwd=os.getcwd())
            )

        data = AnsibleInventoryLoader(source=source, verbosity=verbosity).load()
        logger.debug('Finished loading from source: %s', source)

        status, tb, exc = 'error', '', None
        try:
            self.perform_update(options, data, inventory_update)
            status = 'successful'
        except Exception as e:
            exc = e
            if isinstance(e, KeyboardInterrupt):
                status = 'canceled'
            else:
                tb = traceback.format_exc()

        with ignore_inventory_computed_fields():
            inventory_update = InventoryUpdate.objects.get(pk=inventory_update.pk)
            inventory_update.result_traceback = tb
            inventory_update.status = status
            inventory_update.save(update_fields=['status', 'result_traceback'])
            inventory_source.status = status
            inventory_source.save(update_fields=['status'])
        if exc:
            logger.error(str(exc))

    if exc:
        if isinstance(exc, CommandError):
            sys.exit(1)
        raise exc