def create_n_watch(context, wf_def, wf_data={}): """ Create a running workflow, and watch it until completion Args: context (ArgoContext): context to execute the workflow in wf_def (dict): workflow definition wf_data (dict, optional): data to be passed to workflow. Defaults to {}. Returns: (ArgoWorkflow, ArgoWorkflowStatus): workflow and status of the workflow """ wf = ArgoWorkflow.create(context, wf_def, wf_data=wf_data) try: wf.watch(context, 10, 18) if wf.last_status.complete: return (wf, wf.last_status) wf.watch(context, 60, 1440) except Exception as exc: logger.debug( "ARGO, ArgoWorkflow.create_n_watch(), while watching {}". format(type(exc))) logger.debug( "ARGO, ArgoWorkflow.create_n_watch(), while watching {}". format(exc)) raise exc return (wf, wf.last_status)
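# Usage sketch for create_n_watch (illustrative only). Assumptions not confirmed
# by the source: create_n_watch is exposed as a static/class method on
# ArgoWorkflow (as its own log messages suggest), the context is built via
# argo_context_from_config as in _dump_deploy_logs below, and the numeric
# watch() arguments are read as (poll interval in seconds, number of polls) --
# i.e. a quick 10s x 18 (~3 min) watch followed by a slower 60s x 1440 (~24 h) one.
def _example_run_workflow(wf_def, wf_data=None):
    """Create a workflow from `wf_def` and block until it completes (sketch)."""
    context = argo_context_from_config(settings.ARGO_CONFIG_FILE_PATH)
    wf, status = ArgoWorkflow.create_n_watch(context, wf_def, wf_data=wf_data or {})
    if status.complete:
        logger.debug("workflow {} completed, success: {}".format(
            wf.wf_name, getattr(status, "success", None)))
    return wf, status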
def mount_failed( context, exception_msg, traceback, driverCls, provider, identity, volume_id, unmount=False, **celery_task_args ): from service import volume as volume_service try: celery_logger.debug("mount_failed task started at %s." % timezone.now()) celery_logger.info("task context=%s" % context) err_str = "%s\nMount Error Traceback:%s" % (exception_msg, traceback) celery_logger.error(err_str) driver = get_driver(driverCls, provider, identity) volume = driver.get_volume(volume_id) if unmount: tmp_status = 'umount_error' else: tmp_status = 'mount_error' return volume_service._update_volume_metadata( driver, volume, metadata={'tmp_status': tmp_status} ) except Exception as exc: celery_logger.warn(exc) mount_failed.retry(exc=exc)
def prep_instance_for_snapshot(identity_id, instance_id, **celery_task_args):
    identity = Identity.objects.get(id=identity_id)
    try:
        celery_logger.debug(
            "prep_instance_for_snapshot task started at %s." % timezone.now())
        # NOTE: FIXME if the assumption that the 'linux username'
        # is the 'created_by' AtmosphereUser changes.
        username = identity.created_by.username
        driver = get_esh_driver(identity)
        instance = driver.get_instance(instance_id)
        if instance.extra.get('status', '') != 'active':
            celery_logger.info("prep_instance_for_snapshot skipped")
            return
        playbooks = deploy_prepare_snapshot(instance.ip, username, instance_id)
        celery_logger.info(playbooks.__dict__)
        hostname = build_host_name(instance.id, instance.ip)
        result = not (execution_has_failures(playbooks, hostname)
                      or execution_has_unreachable(playbooks, hostname))
        if not result:
            raise Exception(
                "Error encountered while preparing instance for snapshot: %s"
                % playbooks.stats.summarize(host=hostname))
    except Exception as exc:
        celery_logger.warn(exc)
        prep_instance_for_snapshot.retry(exc=exc)
def dump_logs(self, context, log_dir):
    """
    Dump logs of the workflow into the given log directory.

    A separate log file is written for each pod/step in the workflow,
    named {pod_name}.log

    Args:
        context (ArgoContext): context used to fetch the logs
        log_dir (str): directory to dump logs into
    """
    # find out which pods the workflow consists of
    json_resp = context.client().get_workflow(self.wf_name)
    pod_names = json_resp["status"]["nodes"].keys()

    # dump logs into a separate file for each pod
    for pod_name in pod_names:
        filename = "{}.log".format(pod_name)
        log_file_path = os.path.join(log_dir, filename)

        with open(log_file_path, "a+") as dump_file:
            dump_file.write("workflow {} has {} pods\n".format(
                self.wf_name, len(pod_names)))
            logs_lines = context.client().get_log_for_pod_in_workflow(
                self.wf_name, pod_name, container_name="main")
            dump_file.write("\npod {}:\n".format(pod_name))
            dump_file.writelines(logs_lines)

        logger.debug(
            "ARGO, log dump for workflow {}, pod {} at: {}\n".format(
                self.wf_name, pod_name, log_file_path))
def mount_volume_task( driverCls, provider, identity, instance_id, volume_id, device_location, mount_location, device_type, mount_prefix=None, *args, **kwargs ): try: celery_logger.debug("mount task started at %s." % timezone.now()) celery_logger.debug("mount_location: %s" % (mount_location, )) driver = get_driver(driverCls, provider, identity) username = identity.get_username() instance = driver.get_instance(instance_id) volume = driver.get_volume(volume_id) try: attach_data = volume.extra['attachments'][0] if not device_location: device_location = attach_data['device'] except (KeyError, IndexError): celery_logger.warn( "Volume %s missing attachments in Extra" % (volume, ) ) if not device_location: raise Exception( "No device_location found or inferred by volume %s" % volume ) if not mount_prefix: mount_prefix = "/vol_" last_char = device_location[-1] # /dev/sdb --> b if not mount_location: mount_location = mount_prefix + last_char playbook_results = deploy_mount_volume( instance.ip, username, instance.id, device_location, mount_location=mount_location, device_type=device_type ) celery_logger.info(playbook_results) if execution_has_failures( playbook_results ) or execution_has_unreachable(playbook_results): raise Exception( "Error encountered while mounting volume: instance_id: {}, volume_id: {}" .format(instance_id, volume_id) ) return mount_location except Exception as exc: celery_logger.warn(exc) mount_volume_task.retry(exc=exc)
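# mount_volume_task derives a default mount point from the device name: the
# last character of device_location is appended to mount_prefix
# ("/dev/sdb" with prefix "/vol_" becomes "/vol_b"). A standalone sketch of
# that rule (helper name is illustrative, not part of the module):
def _default_mount_location(device_location, mount_prefix="/vol_"):
    """Map a device path such as '/dev/sdb' to a mount point such as '/vol_b'."""
    return mount_prefix + device_location[-1]

# _default_mount_location("/dev/sdb")  -> "/vol_b"
# _default_mount_location("/dev/vdc")  -> "/vol_c"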
def monitor_sizes_for(provider_id, print_logs=False):
    """
    Run the set of tasks related to monitoring sizes for a provider.
    While debugging, print_logs=True can be very helpful.
    """
    from service.driver import get_admin_driver

    if print_logs:
        console_handler = _init_stdout_logging()

    provider = Provider.objects.get(id=provider_id)
    admin_driver = get_admin_driver(provider)
    # Non-End dated sizes on this provider
    db_sizes = Size.objects.filter(only_current(), provider=provider)
    all_sizes = admin_driver.list_sizes()
    seen_sizes = []
    for cloud_size in all_sizes:
        core_size = convert_esh_size(cloud_size, provider.uuid)
        seen_sizes.append(core_size)

    now_time = timezone.now()
    needs_end_date = [size for size in db_sizes if size not in seen_sizes]
    for size in needs_end_date:
        celery_logger.debug("End dating inactive size: %s" % size)
        size.end_date = now_time
        size.save()

    if print_logs:
        _exit_stdout_logging(console_handler)
def remove_empty_networks_for(provider_id):
    provider = Provider.objects.get(id=provider_id)
    os_driver = get_account_driver(provider)
    all_instances = os_driver.admin_driver.list_all_instances()
    project_map = os_driver.network_manager.project_network_map()
    projects_with_networks = project_map.keys()
    for project in projects_with_networks:
        networks = project_map[project]['network']
        if not isinstance(networks, list):
            networks = [networks]
        for network in networks:
            network_name = network['name']
            celery_logger.debug("Checking if network %s is in use" % network_name)
            if running_instances(network_name, all_instances):
                continue
            # TODO: MUST change when not using 'usergroups' explicitly.
            user = project
            try:
                celery_logger.debug(
                    "Removing project network for User:%s, Project:%s"
                    % (user, project))
                os_driver.network_manager.delete_project_network(user, project)
            except NeutronClientException:
                celery_logger.exception(
                    "Neutron unable to remove project network for %s-%s"
                    % (user, project))
            except NeutronException:
                celery_logger.exception(
                    "Neutron unable to remove project network for %s-%s"
                    % (user, project))
def update_snapshot_cyverse(start_date=None, end_date=None): all_sources = AllocationSource.objects.order_by('name') n = settings.ALLOC_SNAPSHOT_SIZE num_sources = len(all_sources) if num_sources > n: for i in range(0, len(all_sources), n): logger.debug( "Updating {} of {} allocation sources ".format(n, num_sources) ) update_snapshot_cyverse_for.apply_async( args=(all_sources[i:i + n], ), kwargs={ 'start_date': start_date, 'end_date': end_date }, expires=15 * 60 ) else: logger.debug( "Updating all {} allocation sources (snapshot size is {})".format( num_sources, n ) ) update_snapshot_cyverse_for( all_sources, start_date=start_date, end_date=end_date )
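# The batching above slices the queryset so each update_snapshot_cyverse_for
# task handles at most ALLOC_SNAPSHOT_SIZE allocation sources. The same
# slicing pattern in isolation (standalone sketch, no project imports):
def _chunked(items, size):
    """Yield consecutive slices of `items`, each with at most `size` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

# list(_chunked([1, 2, 3, 4, 5], 2))  -> [[1, 2], [3, 4], [5]]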
def update_mount_location(new_mount_location, driverCls, provider, identity,
                          volume_alias):
    """
    Update the 'mount_location' metadata entry on a volume.
    """
    from service import volume as volume_service
    try:
        celery_logger.debug(
            "update_mount_location task started at %s." % datetime.now())
        driver = get_driver(driverCls, provider, identity)
        volume = driver.get_volume(volume_alias)
        if not volume:
            return
        if not new_mount_location:
            return
        volume_metadata = volume.extra['metadata']
        result = volume_service._update_volume_metadata(
            driver, volume, metadata={'mount_location': new_mount_location})
        celery_logger.debug(
            "update_mount_location task finished at %s." % datetime.now())
        return result
    except Exception as exc:
        celery_logger.exception(exc)
        update_mount_location.retry(exc=exc)
def monitor_sizes_for(provider_id, print_logs=False):
    """
    Run the set of tasks related to monitoring sizes for a provider.
    While debugging, print_logs=True can be very helpful.
    """
    from service.driver import get_admin_driver

    if print_logs:
        import logging
        import sys
        consolehandler = logging.StreamHandler(sys.stdout)
        consolehandler.setLevel(logging.DEBUG)
        celery_logger.addHandler(consolehandler)

    provider = Provider.objects.get(id=provider_id)
    admin_driver = get_admin_driver(provider)
    # Non-End dated sizes on this provider
    db_sizes = Size.objects.filter(only_current(), provider=provider)
    all_sizes = admin_driver.list_sizes()
    seen_sizes = []
    for cloud_size in all_sizes:
        core_size = convert_esh_size(cloud_size, provider.uuid)
        seen_sizes.append(core_size)

    now_time = timezone.now()
    needs_end_date = [size for size in db_sizes if size not in seen_sizes]
    for size in needs_end_date:
        celery_logger.debug("End dating inactive size: %s" % size)
        size.end_date = now_time
        size.save()

    if print_logs:
        celery_logger.removeHandler(consolehandler)
def monitor_allocation_sources(usernames=()): """ Monitor allocation sources, if a snapshot shows that all compute has been used, then enforce as necessary """ celery_logger.debug('monitor_allocation_sources - usernames: %s', usernames) allocation_sources = AllocationSource.objects.all() for allocation_source in allocation_sources.order_by('name'): celery_logger.debug( 'monitor_allocation_sources - allocation_source: %s', allocation_source) for user in allocation_source.all_users.order_by('username'): celery_logger.debug('monitor_allocation_sources - user: %s', user) if usernames and user.username not in usernames: celery_logger.info("Skipping User %s - not in the list" % user.username) continue over_allocation = allocation_source.is_over_allocation(user) celery_logger.debug( 'monitor_allocation_sources - user: %s, over_allocation: %s', user, over_allocation) if not over_allocation: continue celery_logger.debug( 'monitor_allocation_sources - Going to enforce on user user: %s', user) allocation_source_overage_enforcement_for_user.apply_async( args=(allocation_source, user))
def mount_failed(task_uuid, driverCls, provider, identity, volume_id,
                 unmount=False, **celery_task_args):
    from service import volume as volume_service
    try:
        celery_logger.debug("mount_failed task started at %s." % datetime.now())
        celery_logger.info("task_uuid=%s" % task_uuid)
        result = app.AsyncResult(task_uuid)
        with allow_join_result():
            exc = result.get(propagate=False)
        err_str = "Mount Error Traceback:%s" % (result.traceback, )
        celery_logger.error(err_str)
        driver = get_driver(driverCls, provider, identity)
        volume = driver.get_volume(volume_id)
        if unmount:
            tmp_status = 'umount_error'
        else:
            tmp_status = 'mount_error'
        update_result = volume_service._update_volume_metadata(
            driver, volume, metadata={'tmp_status': tmp_status})
        celery_logger.debug("mount_failed task finished at %s." % datetime.now())
        return update_result
    except Exception as exc:
        celery_logger.warn(exc)
        mount_failed.retry(exc=exc)
def umount_task(driverCls, provider, identity, instance_id, volume_id, *args, **kwargs): try: celery_logger.debug("umount_task started at %s." % datetime.now()) driver = get_driver(driverCls, provider, identity) instance = driver.get_instance(instance_id) volume = driver.get_volume(volume_id) attach_data = volume.extra["attachments"][0] device = attach_data["device"] # Check mount to find the mount_location for device private_key = "/opt/dev/atmosphere/extras/ssh/id_rsa" kwargs.update({"ssh_key": private_key}) kwargs.update({"timeout": 120}) mount_location = None cm_script = check_mount() kwargs.update({"deploy": cm_script}) driver.deploy_to(instance, **kwargs) regex = re.compile("(?P<device>[\w/]+) on (?P<location>.*) type") for line in cm_script.stdout.split("\n"): res = regex.search(line) if not res: continue search_dict = res.groupdict() dev_found = search_dict["device"] if device == dev_found: mount_location = search_dict["location"] break # Volume not mounted, move along.. if not mount_location: return um_script = umount_volume(device) kwargs.update({"deploy": um_script}) driver.deploy_to(instance, **kwargs) if "device is busy" in um_script.stdout: # Show all processes that are making device busy.. lsof_script = lsof_location(mount_location) kwargs.update({"deploy": lsof_script}) driver.deploy_to(instance, **kwargs) regex = re.compile("(?P<name>[\w]+)\s*(?P<pid>[\d]+)") offending_processes = [] for line in lsof_script.stdout.split("\n"): res = regex.search(line) if not res: continue search_dict = res.groupdict() offending_processes.append((search_dict["name"], search_dict["pid"])) raise DeviceBusyException(mount_location, offending_processes) # Return here if no errors occurred.. celery_logger.debug("umount_task finished at %s." % datetime.now()) except DeviceBusyException: raise except Exception as exc: celery_logger.warn(exc) umount_task.retry(exc=exc)
def unmount_volume_task(driverCls, provider, identity, instance_id, volume_id, *args, **kwargs): try: celery_logger.debug("unmount task started at %s." % timezone.now()) driver = get_driver(driverCls, provider, identity) username = identity.get_username() instance = driver.get_instance(instance_id) volume = driver.get_volume(volume_id) device_location = None try: attach_data = volume.extra['attachments'][0] device_location = attach_data['device'] except (KeyError, IndexError): celery_logger.warn("Volume %s missing attachments in Extra" % (volume, )) if not device_location: raise Exception( "No device_location found or inferred by volume %s" % volume) try: playbooks = deploy_unmount_volume(instance.ip, username, instance.id, device_location) except DeviceBusyException: # Future-Fixme: Update VolumeStatusHistory.extra, set status to 'unmount_failed' raise hostname = build_host_name(instance.id, instance.ip) result = False if execution_has_failures(playbooks, hostname)\ or execution_has_unreachable(playbooks, hostname) else True if not result: raise Exception("Error encountered while unmounting volume: %s" % playbooks.stats.summarize(host=hostname)) return device_location except Exception as exc: celery_logger.warn(exc) unmount_volume_task.retry(exc=exc)
def update_snapshot_cyverse(start_date=None, end_date=None): logger.debug("update_snapshot_cyverse task started at %s." % datetime.now()) end_date = timezone.now().replace( microsecond=0) if not end_date else end_date for allocation_source in AllocationSource.objects.order_by('name'): # calculate and save snapshots here allocation_source_name = allocation_source.name last_renewal_event = EventTable.objects.filter( name='allocation_source_created_or_renewed', payload__allocation_source_name__exact=str( allocation_source_name)).order_by('timestamp') if not last_renewal_event: logger.info('Allocation Source %s Create/Renewal event missing', allocation_source_name) continue start_date = last_renewal_event.last().timestamp.replace( microsecond=0) if not start_date else start_date total_compute_used = 0 total_burn_rate = 0 for user in allocation_source.all_users: compute_used, burn_rate = total_usage( user.username, start_date=start_date, end_date=end_date, allocation_source_name=allocation_source_name, burn_rate=True) UserAllocationSnapshot.objects.update_or_create( allocation_source=allocation_source, user=user, defaults={ 'compute_used': compute_used, 'burn_rate': burn_rate }) total_compute_used += compute_used total_burn_rate += burn_rate AllocationSourceSnapshot.objects.update_or_create( allocation_source=allocation_source, defaults={ 'compute_used': total_compute_used, 'global_burn_rate': total_burn_rate }) run_all( rule_list=cyverse_rules, defined_variables=CyverseTestRenewalVariables( allocation_source, end_date, start_date), defined_actions=CyverseTestRenewalActions(allocation_source, end_date), ) # At the end of the task, fire-off an allocation threshold check logger.debug("update_snapshot_cyverse task finished at %s." % datetime.now()) allocation_threshold_check.apply_async()
def add_membership_task(image_version, group): celery_logger.debug("add_membership_task task started at %s." % timezone.now()) try: add_membership(image_version, group) celery_logger.debug("add_membership_task task finished at %s." % timezone.now()) except Exception as exc: celery_logger.exception(exc) add_membership_task.retry(exc=exc)
def monitor_instances_for(provider_id, users=None, print_logs=False, check_allocations=False, start_date=None, end_date=None): """ Run the set of tasks related to monitoring instances for a provider. Optionally, provide a list of usernames to monitor While debugging, print_logs=True can be very helpful. start_date and end_date allow you to search a 'non-standard' window of time. """ provider = Provider.objects.get(id=provider_id) # For now, lets just ignore everything that isn't openstack. if 'openstack' not in provider.type.name.lower(): return instance_map = _get_instance_owner_map(provider, users=users) if print_logs: console_handler = _init_stdout_logging() # DEVNOTE: Potential slowdown running multiple functions # Break this out when instance-caching is enabled running_total = 0 if not settings.ENFORCING: celery_logger.debug('Settings dictate allocations are NOT enforced') for username in sorted(instance_map.keys()): running_instances = instance_map[username] running_total += len(running_instances) identity = _get_identity_from_tenant_name(provider, username) if identity and running_instances: try: driver = get_cached_driver(identity=identity) core_running_instances = [ convert_esh_instance( driver, inst, identity.provider.uuid, identity.uuid, identity.created_by) for inst in running_instances] except Exception as exc: celery_logger.exception( "Could not convert running instances for %s" % username) continue else: # No running instances. core_running_instances = [] # Using the 'known' list of running instances, cleanup the DB core_instances = _cleanup_missing_instances( identity, core_running_instances) if check_allocations: allocation_result = user_over_allocation_enforcement( provider, username, print_logs, start_date, end_date) if print_logs: _exit_stdout_logging(console_handler) return running_total
def running_instances(network_name, all_instances):
    for instance in all_instances:
        if network_name in instance.extra['addresses'].keys():
            # The network is considered 'in use' as soon as any instance
            # holds an address on it.
            celery_logger.debug("Network %s is in use, Active Instance:%s"
                                % (network_name, instance.id))
            return True
    celery_logger.debug("Network %s is NOT in use" % network_name)
    return False
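# running_instances treats a network as "in use" when its name appears as a
# key of an instance's extra['addresses'] mapping. A minimal sketch of the
# shape it expects (values are illustrative, not taken from a real cloud
# response):
_EXAMPLE_INSTANCE_EXTRA = {
    'addresses': {
        'example_project_net': [
            {'addr': '10.0.0.12', 'OS-EXT-IPS:type': 'fixed'},
        ],
    },
}
# An instance whose extra looks like the above keeps 'example_project_net'
# from being deleted by remove_empty_networks_for.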
def monitor_sizes_for(provider_id, print_logs=False):
    """
    Run the set of tasks related to monitoring sizes for a provider.
    While debugging, print_logs=True can be very helpful.
    """
    from service.driver import get_admin_driver

    if print_logs:
        console_handler = _init_stdout_logging()

    provider = Provider.objects.get(id=provider_id)
    admin_driver = get_admin_driver(provider)
    # Non-End dated sizes on this provider
    db_sizes = Size.objects.filter(only_current(), provider=provider)
    all_sizes = admin_driver.list_sizes()
    seen_sizes = []
    for cloud_size in all_sizes:
        core_size = convert_esh_size(cloud_size, provider.uuid)
        seen_sizes.append(core_size)

    now_time = timezone.now()
    needs_end_date = [size for size in db_sizes if size not in seen_sizes]
    for size in needs_end_date:
        celery_logger.debug("End dating inactive size: %s" % size)
        size.end_date = now_time
        size.save()

    # Find home for 'Unknown Size'
    unknown_sizes = Size.objects.filter(
        provider=provider, name__contains='Unknown Size')
    for size in unknown_sizes:
        # Lookup sizes may not show up in 'list_sizes'
        if size.alias == 'N/A':
            continue    # This is a sentinel value added for a separate purpose.
        try:
            libcloud_size = admin_driver.get_size(size.alias, forced_lookup=True)
        except BaseHTTPError as error:
            if error.code == 404:
                # The size may have been truly deleted
                continue
            raise
        if not libcloud_size:
            continue
        cloud_size = OSSize(libcloud_size)
        core_size = convert_esh_size(cloud_size, provider.uuid)

    if print_logs:
        _exit_stdout_logging(console_handler)
    for size in seen_sizes:
        size.esh = None
    return seen_sizes
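# monitor_sizes_for and the other monitor_* tasks here follow the same
# reconciliation pattern: anything recorded in the DB but no longer seen in
# the cloud gets end-dated. The set difference in isolation (sketch; the real
# code compares Django model instances with `not in`):
def _stale_records(db_records, seen_records):
    """Return DB records that no longer appear in the cloud listing."""
    return [record for record in db_records if record not in seen_records]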
def monitor_volumes_for(provider_id, print_logs=False):
    """
    Run the set of tasks related to monitoring volumes for a provider.
    While debugging, print_logs=True can be very helpful.
    """
    from service.driver import get_account_driver
    from core.models import Identity
    if print_logs:
        console_handler = _init_stdout_logging()

    provider = Provider.objects.get(id=provider_id)
    account_driver = get_account_driver(provider)
    # Non-End dated volumes on this provider
    db_volumes = Volume.objects.filter(
        only_current_source(), instance_source__provider=provider)
    all_volumes = account_driver.admin_driver.list_all_volumes(timeout=30)
    seen_volumes = []
    for cloud_volume in all_volumes:
        try:
            core_volume = convert_esh_volume(
                cloud_volume, provider_uuid=provider.uuid)
            seen_volumes.append(core_volume)
        except ObjectDoesNotExist:
            tenant_id = cloud_volume.extra['object']['os-vol-tenant-attr:tenant_id']
            tenant = account_driver.get_project_by_id(tenant_id)
            tenant_name = tenant.name if tenant else tenant_id
            try:
                if not tenant:
                    celery_logger.warn(
                        "Warning: tenant_id %s found on volume %s, but did not "
                        "exist from the account driver perspective.",
                        tenant_id, cloud_volume)
                    raise ObjectDoesNotExist()
                identity = Identity.objects.filter(
                    contains_credential('ex_project_name', tenant_name),
                    provider=provider
                ).first()
                if not identity:
                    raise ObjectDoesNotExist()
                core_volume = convert_esh_volume(
                    cloud_volume, provider.uuid,
                    identity.uuid, identity.created_by)
            except ObjectDoesNotExist:
                celery_logger.info(
                    "Skipping Volume %s - No Identity for: Provider:%s + "
                    "Project Name:%s" % (cloud_volume.id, provider, tenant_name))
                pass

    now_time = timezone.now()
    needs_end_date = [volume for volume in db_volumes
                      if volume not in seen_volumes]
    for volume in needs_end_date:
        celery_logger.debug("End dating inactive volume: %s" % volume)
        volume.end_date = now_time
        volume.save()

    if print_logs:
        _exit_stdout_logging(console_handler)
    for vol in seen_volumes:
        vol.esh = None
    return [vol.instance_source.identifier for vol in seen_volumes]
def monitor_instances_for( provider_id, users=None, print_logs=False, start_date=None, end_date=None ): """ Run the set of tasks related to monitoring instances for a provider. Optionally, provide a list of usernames to monitor While debugging, print_logs=True can be very helpful. start_date and end_date allow you to search a 'non-standard' window of time. """ provider = Provider.objects.get(id=provider_id) # For now, lets just ignore everything that isn't openstack. if 'openstack' not in provider.type.name.lower(): return instance_map = _get_instance_owner_map(provider, users=users) if print_logs: console_handler = _init_stdout_logging() seen_instances = [] # DEVNOTE: Potential slowdown running multiple functions # Break this out when instance-caching is enabled if not settings.ENFORCING: celery_logger.debug('Settings dictate allocations are NOT enforced') for tenant_name in sorted(instance_map.keys()): running_instances = instance_map[tenant_name] identity = _get_identity_from_tenant_name(provider, tenant_name) if identity and running_instances: try: driver = get_cached_driver(identity=identity) core_running_instances = [ convert_esh_instance( driver, inst, identity.provider.uuid, identity.uuid, identity.created_by ) for inst in running_instances ] seen_instances.extend(core_running_instances) except Exception: celery_logger.exception( "Could not convert running instances for %s" % tenant_name ) continue else: # No running instances. core_running_instances = [] # Using the 'known' list of running instances, cleanup the DB _cleanup_missing_instances(identity, core_running_instances) if print_logs: _exit_stdout_logging(console_handler) # return seen_instances NOTE: this has been commented out to avoid PicklingError! # TODO: Uncomment the above, Determine what _we can return_ and return that instead.... return
def allocation_source_overage_enforcement_for_user(allocation_source, user): celery_logger.debug('allocation_source_overage_enforcement_for_user - allocation_source: %s, user: %s', allocation_source, user) user_instances = [] for identity in user.current_identities: try: celery_logger.debug('allocation_source_overage_enforcement_for_user - identity: %s', identity) affected_instances = allocation_source_overage_enforcement_for(allocation_source, user, identity) user_instances.extend(affected_instances) except Exception: celery_logger.exception( 'allocation_source_overage_enforcement_for allocation_source: %s, user: %s, and identity: %s', allocation_source, user, identity) return user_instances
def update_volume_metadata(driverCls, provider, identity, volume_alias,
                           metadata):
    """
    Update metadata on a volume.
    """
    from service import volume as volume_service
    try:
        celery_logger.debug(
            "update_volume_metadata task started at %s." % datetime.now())
        driver = get_driver(driverCls, provider, identity)
        volume = driver.get_volume(volume_alias)
        if not volume:
            return
        result = volume_service.update_volume_metadata(
            driver, volume, metadata=metadata)
        celery_logger.debug(
            "update_volume_metadata task finished at %s." % datetime.now())
        return result
    except Exception as exc:
        celery_logger.exception(exc)
        update_volume_metadata.retry(exc=exc)
def remove_empty_networks_for(provider_id):
    provider = Provider.objects.get(id=provider_id)
    os_driver = get_account_driver(provider)
    if not os_driver:
        celery_logger.warn(
            "Cannot remove_empty_networks_for provider %s -- "
            "Account Driver not created" % provider)
        return
    all_instances = os_driver.admin_driver.list_all_instances()
    project_map = os_driver.network_manager.project_network_map()
    known_project_names = Credential.objects.filter(
        key='ex_project_name').values_list('value', flat=True)
    projects_with_networks = sorted(
        [k for k in project_map.keys() if k in known_project_names])
    for project in projects_with_networks:
        networks = project_map[project]['network']
        if not isinstance(networks, list):
            networks = [networks]
        for network in networks:
            network_name = network['name']
            celery_logger.debug("Checking if network %s is in use" % network_name)
            if running_instances(network_name, all_instances):
                continue
            user = project
            identity = Identity.objects.filter(
                provider_id=provider_id,
                credential__key='ex_project_name',
                credential__value=project).filter(
                    credential__key='key',
                    credential__value=user).first()
            if not identity:
                celery_logger.warn(
                    "NOT Removing project network for User:%s, Project:%s "
                    "-- No Valid Identity found!" % (user, project))
                continue
            try:
                celery_logger.debug(
                    "Removing project network for User:%s, Project:%s"
                    % (user, project))
                os_driver.delete_user_network(identity)
            except NeutronClientException:
                celery_logger.exception(
                    "Neutron unable to remove project network for %s-%s"
                    % (user, project))
            except NeutronException:
                celery_logger.exception(
                    "Neutron unable to remove project network for %s-%s"
                    % (user, project))
def check_volume_task(driverCls, provider, identity, instance_id, volume_id,
                      *args, **kwargs):
    try:
        celery_logger.debug("check_volume task started at %s." % datetime.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        attach_data = volume.extra['attachments'][0]
        device = attach_data['device']

        private_key = ATMOSPHERE_PRIVATE_KEYFILE
        kwargs.update({'ssh_key': private_key})
        kwargs.update({'timeout': 120})

        # One script to make two checks:
        # 1. Volume exists
        # 2. Volume has a filesystem
        cv_script = check_volume(device)
        # NOTE: non_zero_deploy needed to stop LibcloudDeploymentError
        # from being raised
        kwargs.update({'deploy': cv_script, 'non_zero_deploy': True})
        driver.deploy_to(instance, **kwargs)
        kwargs.pop('non_zero_deploy', None)
        # Script executed; inspect the exit status
        if cv_script.exit_status != 0:
            if 'No such file' in cv_script.stdout:
                raise Exception('Volume check failed: %s. '
                                'Device %s does not exist on instance %s'
                                % (volume, device, instance))
            elif 'Bad magic number' in cv_script.stdout:
                # Filesystem needs to be created for this device
                celery_logger.info("Mkfs needed")
                mkfs_script = mkfs_volume(device)
                kwargs.update({'deploy': mkfs_script})
                driver.deploy_to(instance, **kwargs)
            else:
                raise Exception('Volume check failed: unexpected output: %s'
                                % cv_script.stdout)

        celery_logger.debug("check_volume task finished at %s." % datetime.now())
    except LibcloudDeploymentError as exc:
        celery_logger.exception(exc)
    except Exception as exc:
        celery_logger.warn(exc)
        check_volume_task.retry(exc=exc)
def check_volume_task(
    driverCls, provider, identity, instance_id, volume_id,
    device_type='ext4', *args, **kwargs
):
    try:
        celery_logger.debug("check_volume task started at %s." % timezone.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        username = identity.get_username()
        attach_data = volume.extra['attachments'][0]
        device_location = attach_data['device']
        celery_logger.info("device_location: %s" % device_location)

        # One playbook to make two checks:
        # 1. Volume exists
        # 2. Volume has a filesystem
        #    (If not, create one of type 'device_type')
        playbook_results = deploy_check_volume(
            instance.ip, username, instance.id,
            device_location, device_type=device_type
        )
        success = not (
            execution_has_failures(playbook_results)
            or execution_has_unreachable(playbook_results)
        )
        if not success:
            raise Exception(
                "Error encountered while checking volume for filesystem: "
                "instance_id: {}, volume_id: {}".format(instance_id, volume_id)
            )
        return success
    except Exception as exc:
        celery_logger.warn(exc)
        check_volume_task.retry(exc=exc)
def dump_pod_logs(self, context, pod_name, log_file_path):
    """
    Dump logs of a pod in the workflow into a log file at the given path.

    Technically the identifier is a node name; the method and its pod_name
    argument are named this way only to match the URL in the Argo swagger doc.

    Args:
        context (ArgoContext): context used to fetch the logs
        pod_name (str): name of the pod
        log_file_path (str): path to the log file
    """
    with open(log_file_path, "a+") as log_file:
        logs_lines = context.client().get_log_for_pod_in_workflow(
            self.wf_name, pod_name, container_name="main")
        log_file.write("\n".join(logs_lines))
    logger.debug(
        "ARGO, log dump for workflow {}, pod {} at: {}\n".format(
            self.wf_name, pod_name, log_file_path))
def check_volume_task(driverCls, provider, identity, instance_id, volume_id,
                      device_type='ext4', *args, **kwargs):
    try:
        celery_logger.debug("check_volume task started at %s." % datetime.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        username = identity.get_username()
        attach_data = volume.extra['attachments'][0]
        device_location = attach_data['device']
        celery_logger.info("device_location: %s" % device_location)

        # One playbook to make two checks:
        # 1. Volume exists
        # 2. Volume has a filesystem
        #    (If not, create one of type 'device_type')
        playbooks = deploy_check_volume(
            instance.ip, username, instance.id,
            device_location, device_type=device_type)
        celery_logger.info(playbooks.__dict__)
        hostname = build_host_name(instance.id, instance.ip)
        result = not (execution_has_failures(playbooks, hostname)
                      or execution_has_unreachable(playbooks, hostname))
        if not result:
            raise Exception(
                "Error encountered while checking volume for filesystem: %s"
                % playbooks.stats.summarize(host=hostname))
        return result
    except LibcloudDeploymentError as exc:
        celery_logger.exception(exc)
    except Exception as exc:
        celery_logger.warn(exc)
        check_volume_task.retry(exc=exc)
def update_volume_metadata( driverCls, provider, identity, volume_alias, metadata ): """ """ from service import volume as volume_service try: celery_logger.debug( "update_volume_metadata task started at %s." % timezone.now() ) driver = get_driver(driverCls, provider, identity) volume = driver.get_volume(volume_alias) if not volume: return return volume_service._update_volume_metadata( driver, volume, metadata=metadata ) except Exception as exc: celery_logger.exception(exc) update_volume_metadata.retry(exc=exc)
def create(context, wf_def, wf_data={}, lint=False):
    """
    Create a running workflow

    Args:
        context (ArgoContext): context to execute the workflow in
        wf_def (dict): workflow definition
        wf_data (dict, optional): workflow data to be passed along. Defaults to {}.
        lint (bool, optional): whether to submit the workflow definition for
            linting first. Defaults to False.

    Returns:
        ArgoWorkflow: ArgoWorkflow object created based on the returned json
    """
    if wf_data:
        wf_def = _populate_wf_data(wf_def, wf_data)

    json_resp = context.client().run_workflow(wf_def)
    wf_name = json_resp["metadata"]["name"]

    logger.debug("ARGO, workflow {} created".format(wf_name))
    return ArgoWorkflow(wf_name)
def unmount_volume_task( driverCls, provider, identity, instance_id, volume_id, *args, **kwargs ): try: celery_logger.debug("unmount task started at %s." % timezone.now()) driver = get_driver(driverCls, provider, identity) username = identity.get_username() instance = driver.get_instance(instance_id) volume = driver.get_volume(volume_id) device_location = None try: attach_data = volume.extra['attachments'][0] device_location = attach_data['device'] except (KeyError, IndexError): celery_logger.warn( "Volume %s missing attachments in Extra" % (volume, ) ) if not device_location: raise Exception( "No device_location found or inferred by volume %s" % volume ) try: playbook_results = deploy_unmount_volume( instance.ip, username, instance.id, device_location ) except DeviceBusyException: # Future-Fixme: Update VolumeStatusHistory.extra, set status to 'unmount_failed' raise if execution_has_failures( playbook_results ) or execution_has_unreachable(playbook_results): raise Exception( "Error encountered while unmounting volume: instance_id: {}, volume_id: {}" .format(instance_id, volume_id) ) return device_location except Exception as exc: celery_logger.warn(exc) unmount_volume_task.retry(exc=exc)
def allocation_threshold_check(): logger.debug("allocation_threshold_check task started at %s." % datetime.now()) if not settings.CHECK_THRESHOLD: logger.debug("CHECK_THRESHOLD is FALSE -- allocation_threshold_check task finished at %s." % datetime.now()) return for allocation_source in AllocationSource.objects.filter(compute_allowed__gte=0).all(): snapshot = allocation_source.snapshot percentage_used = (snapshot.compute_used / snapshot.compute_allowed) * 100 # check if percentage more than threshold THRESHOLD = [50.0, 90.0] for threshold in THRESHOLD: if percentage_used > threshold: compute_used = snapshot.compute_used allocation_source_name = allocation_source.name # check if event has been fired prev_event = EventTable.objects.filter(name='allocation_source_threshold_met', payload__allocation_source_name=allocation_source_name, payload__threshold=threshold).last() if prev_event: continue payload = {} payload['allocation_source_name'] = allocation_source_name payload['threshold'] = threshold payload['usage_percentage'] = float(percentage_used) EventTable.objects.create( name='allocation_source_threshold_met', payload=payload, entity_id=payload['allocation_source_name']) break logger.debug("allocation_threshold_check task finished at %s." % datetime.now())
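# Worked example of the threshold arithmetic above (pure sketch, no models):
# with compute_allowed=1000 and compute_used=520, usage is 52%, which crosses
# the 50% threshold but not 90%, so one 'allocation_source_threshold_met'
# event (threshold=50.0) would be fired if none exists yet.
def _crossed_thresholds(compute_used, compute_allowed, thresholds=(50.0, 90.0)):
    """Return the thresholds exceeded by current usage, lowest first."""
    if not compute_allowed:
        return []
    percentage_used = (compute_used / float(compute_allowed)) * 100
    return [t for t in thresholds if percentage_used > t]

# _crossed_thresholds(520, 1000)  -> [50.0]
# _crossed_thresholds(950, 1000)  -> [50.0, 90.0]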
def update_snapshot_cyverse(start_date=None, end_date=None): logger.debug("update_snapshot_cyverse task started at %s." % datetime.now()) end_date = timezone.now().replace(microsecond=0) if not end_date else end_date for allocation_source in AllocationSource.objects.order_by('name'): # calculate and save snapshots here allocation_source_name = allocation_source.name last_renewal_event = EventTable.objects.filter( name='allocation_source_created_or_renewed', payload__allocation_source_name__exact=str(allocation_source_name)).order_by('timestamp') if not last_renewal_event: logger.info('Allocation Source %s Create/Renewal event missing', allocation_source_name) continue start_date = last_renewal_event.last().timestamp.replace(microsecond=0) if not start_date else start_date total_compute_used = 0 total_burn_rate = 0 for user in allocation_source.all_users: compute_used, burn_rate = total_usage(user.username, start_date=start_date, end_date=end_date, allocation_source_name=allocation_source_name, burn_rate=True) UserAllocationSnapshot.objects.update_or_create(allocation_source=allocation_source, user=user, defaults={'compute_used': compute_used, 'burn_rate': burn_rate}) total_compute_used += compute_used total_burn_rate += burn_rate AllocationSourceSnapshot.objects.update_or_create(allocation_source=allocation_source, defaults={'compute_used': total_compute_used, 'global_burn_rate': total_burn_rate}) run_all(rule_list=cyverse_rules, defined_variables=CyverseTestRenewalVariables(allocation_source, current_time=end_date, last_renewal_event_date=start_date), defined_actions=CyverseTestRenewalActions(allocation_source, current_time=end_date)) # At the end of the task, fire-off an allocation threshold check logger.debug("update_snapshot_cyverse task finished at %s." % datetime.now()) allocation_threshold_check.apply_async()
def mount_failed(task_uuid, driverCls, provider, identity, volume_id,
                 unmount=False, **celery_task_args):
    from service import volume as volume_service
    try:
        celery_logger.debug("mount_failed task started at %s." % datetime.now())
        celery_logger.info("task_uuid=%s" % task_uuid)
        result = app.AsyncResult(task_uuid)
        with allow_join_result():
            exc = result.get(propagate=False)
        err_str = "Mount Error Traceback:%s" % (result.traceback,)
        celery_logger.error(err_str)
        driver = get_driver(driverCls, provider, identity)
        volume = driver.get_volume(volume_id)
        if unmount:
            tmp_status = "umount_error"
        else:
            tmp_status = "mount_error"
        update_result = volume_service.update_volume_metadata(
            driver, volume, metadata={"tmp_status": tmp_status})
        celery_logger.debug("mount_failed task finished at %s." % datetime.now())
        return update_result
    except Exception as exc:
        celery_logger.warn(exc)
        mount_failed.retry(exc=exc)
def _dump_deploy_logs(wf, username, instance_uuid): """ Dump workflow logs locally Args: wf (ArgoWorkflow): workflow to dump logs of username (str): username of owner of the instance instance_uuid (str): uuid of the instance """ try: context = argo_context_from_config(settings.ARGO_CONFIG_FILE_PATH) timestamp = time.strftime("%Y-%m-%d_%H%M%S", time.localtime()) log_dir = _create_deploy_log_dir(username, instance_uuid, timestamp) # fetch all info about pods in workflow nodes = wf.get_nodes(context) for node_name, node in nodes.items(): playbook_name = None # try finding playbook filename from parameters if "inputs" in node and "parameters" in node["inputs"]: for param in node["inputs"]["parameters"]: if param["name"] == "playbook": playbook_name = os.path.basename(param["value"]) break if playbook_name: log_filename = os.path.join(log_dir, playbook_name + ".log") else: # uses node name if playbook filename is not found log_filename = os.path.join(log_dir, node_name + ".log") wf.dump_pod_logs(context, node_name, log_filename) except Exception as exc: celery_logger.debug( "ARGO, failed to dump logs for workflow {}, {}".format( wf.wf_name, type(exc))) celery_logger.debug(exc)
def attach_task(driverCls, provider, identity, instance_id, volume_id,
                device_choice=None, *args, **kwargs):
    try:
        celery_logger.debug("attach_task started at %s." % datetime.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)

        # Step 1. Attach the volume
        # NOTE: device_choice is not guaranteed to match the actual device
        driver.attach_volume(instance, volume, device_choice)

        # When the result returns the volume will be 'attaching'
        # We can't do anything until the volume is 'available/in-use'
        attempts = 0
        while True:
            volume = driver.get_volume(volume_id)
            # Give up if you can't find the volume
            if not volume:
                return None
            if attempts > 6:    # After 6 attempts (~1min)
                break
            # Openstack Check
            if isinstance(driver, OSDriver) and\
                    'attaching' not in volume.extra.get('status', ''):
                break
            if isinstance(driver, EucaDriver) and\
                    'attaching' not in volume.extra.get('status', ''):
                break
            # Exponential backoff..
            attempts += 1
            sleep_time = 2**attempts
            celery_logger.debug("Volume %s is not ready (%s). Sleep for %s"
                                % (volume.id,
                                   volume.extra.get('status', 'no-status'),
                                   sleep_time))
            time.sleep(sleep_time)

        if 'available' in volume.extra.get('status', ''):
            raise Exception("Volume %s failed to attach to instance %s"
                            % (volume.id, instance.id))

        # Device path for euca == openstack
        try:
            attach_data = volume.extra['attachments'][0]
            device = attach_data['device']
        except (IndexError, KeyError):
            celery_logger.warn("Could not find 'device' in "
                               "volume.extra['attachments']: "
                               "Volume:%s Extra:%s" % (volume.id, volume.extra))
            device = None

        celery_logger.debug("attach_task finished at %s." % datetime.now())
        return device
    except Exception as exc:
        celery_logger.warn(exc)
        attach_task.retry(exc=exc)
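# The retry loop above polls the volume and backs off exponentially, sleeping
# 2**attempts seconds (2, 4, 8, ...) between polls until a bounded number of
# attempts is reached. The same pattern in isolation (sketch; get_status and
# is_ready stand in for driver.get_volume and the status checks):
def _poll_with_backoff(get_status, is_ready, max_attempts=6):
    """Poll get_status() until is_ready(status) is truthy or attempts run out."""
    import time
    for attempt in range(1, max_attempts + 1):
        status = get_status()
        if is_ready(status):
            return status
        time.sleep(2 ** attempt)
    return None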
def detach_task(
    driverCls, provider, identity, instance_id, volume_id, *args, **kwargs
):
    try:
        celery_logger.debug("detach_task started at %s." % timezone.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        driver.detach_volume(volume)
        # When the result returns the volume will be 'detaching'
        # We will ensure the volume does not return to 'in-use'
        attempts = 0
        while True:
            volume = driver.get_volume(volume_id)
            if attempts > 6:    # After 6 attempts (~1min)
                break
            # The Openstack way
            if isinstance(driver, OSDriver)\
                    and 'detaching' not in volume.extra['status']:
                break
            # The Eucalyptus way
            attach_data = volume.extra['attachments'][0]
            if isinstance(driver, EucaDriver) and attach_data\
                    and 'detaching' not in attach_data.get('status'):
                break
            # Exponential backoff..
            attempts += 1
            sleep_time = 2**attempts
            celery_logger.debug(
                "Volume %s is not ready (%s). Sleep for %s"
                % (volume.id, volume.extra['status'], sleep_time)
            )
            time.sleep(sleep_time)

        if 'in-use' in volume.extra['status']:
            raise Exception(
                "Failed to detach Volume %s from instance %s" % (volume, instance)
            )

        celery_logger.debug("detach_task finished at %s." % timezone.now())
    except DeviceBusyException:
        # We should NOT retry if the device is busy
        raise
    except Exception as exc:
        # If the volume is NOT attached, do not retry.
        if 'Volume is not attached' in exc.message:
            return
        celery_logger.exception(exc)
        detach_task.retry(exc=exc)
def attach_task( driverCls, provider, identity, instance_id, volume_id, device_choice=None, *args, **kwargs ): celery_logger.debug("attach_task started at %s." % timezone.now()) driver = get_driver(driverCls, provider, identity) from service.volume import attach_volume attach_volume(driver, instance_id, volume_id, device_choice=device_choice) attempts = 0 while True: volume = driver.get_volume(volume_id) assert volume, "Volume ({}) does not exist".format(volume_id) volume_status = volume.extra.get('status', '') if volume_status == "in-use": break if attempts > 4: raise Exception( "Attach task timed out for volume {} and instance {}, volume status: {}" .format(volume_id, instance_id, volume_status) ) celery_logger.debug( "Volume {} is not ready. Expected 'in-use', got '{}'".format( volume_id, volume_status ) ) time.sleep(10) attempts += 1 try: attach_data = volume.extra['attachments'][0] device = attach_data['device'] except (IndexError, KeyError): raise Exception( "Could not find 'device' in volume.extra {}".format(volume.extra) ) celery_logger.debug("attach_task finished at %s." % timezone.now()) return device
def remove_empty_networks(): celery_logger.debug("remove_empty_networks task started at %s." % datetime.now()) for provider in Provider.get_active(type_name='openstack'): remove_empty_networks_for.apply_async(args=[provider.id])
def mount_task(driverCls, provider, identity, instance_id, volume_id,
               device=None, mount_location=None, *args, **kwargs):
    try:
        celery_logger.debug("mount task started at %s." % datetime.now())
        celery_logger.debug("mount_location: %s" % (mount_location, ))
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        username = identity.get_username()
        # DEV NOTE: Set as 'users' because this is a GUARANTEED group
        # and we know our 'user' will exist (if atmo_init_full was executed)
        # in case the VM does NOT rely on iPlant LDAP
        groupname = "users"
        celery_logger.debug(volume)
        try:
            attach_data = volume.extra['attachments'][0]
            if not device:
                device = attach_data['device']
        except (KeyError, IndexError):
            celery_logger.warn("Volume %s missing attachments in Extra"
                               % (volume,))
            device = None
        if not device:
            celery_logger.warn("Device never attached. Nothing to mount")
            return None

        private_key = "/opt/dev/atmosphere/extras/ssh/id_rsa"
        kwargs.update({'ssh_key': private_key})
        kwargs.update({'timeout': 120})

        # Step 2. Check the volume is not already mounted
        cm_script = check_mount()
        kwargs.update({'deploy': cm_script})
        driver.deploy_to(instance, **kwargs)

        if device in cm_script.stdout:
            mount_location = _parse_mount_location(cm_script.stdout, device)
            if not mount_location:
                raise Exception("Device already mounted, "
                                "but mount location could not be determined! "
                                "Check _parse_mount_location()!")
            celery_logger.warn(
                "Device already mounted. Mount output:%s" % cm_script.stdout)
            # Device has already been mounted. Move along..
            return mount_location

        # Step 3. Find a suitable location to mount the volume
        celery_logger.info("Original mount location - %s" % mount_location)
        if not mount_location:
            inc = 1
            while True:
                if '/vol%s' % inc in cm_script.stdout:
                    inc += 1
                else:
                    break
            mount_location = '/vol%s' % inc

        celery_logger.info("Device location - %s" % device)
        celery_logger.info("New mount location - %s" % mount_location)
        mv_script = mount_volume(device, mount_location, username, groupname)
        kwargs.update({'deploy': mv_script})
        driver.deploy_to(instance, **kwargs)
        celery_logger.debug("mount task finished at %s." % datetime.now())
        return mount_location
    except Exception as exc:
        celery_logger.warn(exc)
        mount_task.retry(exc=exc)
def monitor_allocation_sources(usernames=()): """ Monitor allocation sources, if a snapshot shows that all compute has been used, then enforce as necessary """ celery_logger.debug('monitor_allocation_sources - usernames: %s', usernames) allocation_sources = AllocationSource.objects.all() for allocation_source in allocation_sources.order_by('name'): celery_logger.debug( 'monitor_allocation_sources - allocation_source: %s', allocation_source ) for user in allocation_source.all_users.order_by('username'): celery_logger.debug('monitor_allocation_sources - user: %s', user) if usernames and user.username not in usernames: celery_logger.info( "Skipping User %s - not in the list" % user.username ) continue over_allocation = allocation_source.is_over_allocation(user) celery_logger.debug( 'monitor_allocation_sources - user: %s, over_allocation: %s', user, over_allocation ) enforcement_override_choice = AllocationSourcePluginManager.get_enforcement_override( user, allocation_source ) celery_logger.debug( 'monitor_allocation_sources - enforcement_override_choice: %s', enforcement_override_choice ) if over_allocation and enforcement_override_choice == EnforcementOverrideChoice.NEVER_ENFORCE: celery_logger.debug( 'Allocation source is over allocation, but %s + user %s has an override of %s, ' 'therefore not enforcing', allocation_source, user, enforcement_override_choice ) continue if not over_allocation and enforcement_override_choice == EnforcementOverrideChoice.ALWAYS_ENFORCE: celery_logger.debug( 'Allocation source is not over allocation, but %s + user %s has an override of %s, ' 'therefore enforcing', allocation_source, user, enforcement_override_choice ) # Note: The enforcing happens in the next `if` statement. if over_allocation or enforcement_override_choice == EnforcementOverrideChoice.ALWAYS_ENFORCE: assert enforcement_override_choice in ( EnforcementOverrideChoice.NO_OVERRIDE, EnforcementOverrideChoice.ALWAYS_ENFORCE ) celery_logger.debug( 'monitor_allocation_sources - Going to enforce on user: %s', user ) allocation_source_overage_enforcement_for_user.apply_async( args=(allocation_source, user) )
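# The enforcement decision above reduces to a small truth table over
# (over_allocation, enforcement_override_choice). A standalone sketch of that
# decision; the string values mirror the EnforcementOverrideChoice members
# used above, but this helper itself is illustrative:
def _should_enforce(over_allocation, override):
    """Return True when overage enforcement should run for a user."""
    if override == 'NEVER_ENFORCE':
        return False
    if override == 'ALWAYS_ENFORCE':
        return True
    # NO_OVERRIDE: enforce only when the user is over allocation
    return bool(over_allocation)

# _should_enforce(True, 'NO_OVERRIDE')     -> True
# _should_enforce(True, 'NEVER_ENFORCE')   -> False
# _should_enforce(False, 'ALWAYS_ENFORCE') -> True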