def prep_instance_for_snapshot(identity_id, instance_id, **celery_task_args):
    identity = Identity.objects.get(id=identity_id)
    try:
        celery_logger.debug("prep_instance_for_snapshot task started at %s." % timezone.now())
        # NOTE: FIXME if the assumption that the 'linux username'
        # is the 'created_by' AtmosphereUser ever changes.
        username = identity.created_by.username
        driver = get_esh_driver(identity)
        instance = driver.get_instance(instance_id)
        if instance.extra.get('status', '') != 'active':
            celery_logger.info("prep_instance_for_snapshot skipped")
            return
        playbooks = deploy_prepare_snapshot(instance.ip, username, instance_id)
        celery_logger.info(playbooks.__dict__)
        hostname = build_host_name(instance.id, instance.ip)
        result = not (execution_has_failures(playbooks, hostname)
                      or execution_has_unreachable(playbooks, hostname))
        if not result:
            raise Exception(
                "Error encountered while preparing instance for snapshot: %s"
                % playbooks.stats.summarize(host=hostname))
    except Exception as exc:
        celery_logger.warn(exc)
        prep_instance_for_snapshot.retry(exc=exc)

def umount_task(driverCls, provider, identity, instance_id, volume_id,
                *args, **kwargs):
    try:
        celery_logger.debug("umount_task started at %s." % datetime.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        attach_data = volume.extra['attachments'][0]
        device = attach_data['device']

        # Check mount to find the mount_location for device
        private_key = "/opt/dev/atmosphere/extras/ssh/id_rsa"
        kwargs.update({'ssh_key': private_key})
        kwargs.update({'timeout': 120})
        mount_location = None
        cm_script = check_mount()
        kwargs.update({'deploy': cm_script})
        driver.deploy_to(instance, **kwargs)
        regex = re.compile(r"(?P<device>[\w/]+) on (?P<location>.*) type")
        for line in cm_script.stdout.split('\n'):
            res = regex.search(line)
            if not res:
                continue
            search_dict = res.groupdict()
            dev_found = search_dict['device']
            if device == dev_found:
                mount_location = search_dict['location']
                break

        # Volume not mounted, move along..
        if not mount_location:
            return

        um_script = umount_volume(device)
        kwargs.update({'deploy': um_script})
        driver.deploy_to(instance, **kwargs)
        if 'device is busy' in um_script.stdout:
            # Show all processes that are making device busy..
            lsof_script = lsof_location(mount_location)
            kwargs.update({'deploy': lsof_script})
            driver.deploy_to(instance, **kwargs)
            regex = re.compile(r"(?P<name>[\w]+)\s*(?P<pid>[\d]+)")
            offending_processes = []
            for line in lsof_script.stdout.split('\n'):
                res = regex.search(line)
                if not res:
                    continue
                search_dict = res.groupdict()
                offending_processes.append(
                    (search_dict['name'], search_dict['pid']))
            raise DeviceBusyException(mount_location, offending_processes)
        # Return here if no errors occurred..
        celery_logger.debug("umount_task finished at %s." % datetime.now())
    except DeviceBusyException:
        raise
    except Exception as exc:
        celery_logger.warn(exc)
        umount_task.retry(exc=exc)

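# --- Illustrative sketch (not part of the original task set): how the
# check_mount regex above pulls device/mount-location pairs out of `mount`
# output. The sample line below is hypothetical.
def _example_parse_mount_output():
    import re
    sample = "/dev/vdb on /vol1 type ext4 (rw,relatime)"
    regex = re.compile(r"(?P<device>[\w/]+) on (?P<location>.*) type")
    match = regex.search(sample)
    # -> {'device': '/dev/vdb', 'location': '/vol1'}
    return match.groupdict()
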
def get_shared_identities(account_driver, cloud_machine, tenant_id_name_map):
    """
    INPUT: Provider, Cloud Machine (private), mapping of tenant_id to tenant_name
    OUTPUT: List of identities that *include* the 'tenant name' credential
            matched to 'a shared user' in openstack.
    """
    from core.models import Identity
    cloud_membership = account_driver.image_manager.shared_images_for(
        image_id=cloud_machine.id)
    # NOTE: 'all_identities' starts as a list, in case no ValuesListQuerySet
    # is ever found.
    all_identities = []
    for cloud_machine_membership in cloud_membership:
        tenant_id = cloud_machine_membership.member_id
        tenant_name = tenant_id_name_map.get(tenant_id)
        if not tenant_name:
            celery_logger.warn("TENANT ID: %s NOT FOUND - %s"
                               % (tenant_id, cloud_machine_membership))
            continue
        # Find matching 'tenantName' credential and add all matching
        # identities with that tenantName.
        matching_creds = Credential.objects.filter(
            key='ex_tenant_name',    # TODO: ex_project_name on next OStack update.
            value=tenant_name,
            # NOTE: re-add this line when not replicating clouds!
            #identity__provider=account_driver.core_provider)
        )
        identity_ids = matching_creds.values_list('identity', flat=True)
        if not all_identities:
            all_identities = identity_ids
        else:
            all_identities = all_identities | identity_ids
    identity_list = Identity.objects.filter(id__in=all_identities)
    return identity_list

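# --- Illustrative sketch (an assumption, not part of the original task
# set): the tenant_id_name_map consumed above is a plain
# {tenant_id: tenant_name} dict. `projects` stands in for whatever project
# listing the account driver provides (objects with .id and .name).
def _example_tenant_id_name_map(projects):
    return {project.id: project.name for project in projects}
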
def unmount_volume_task(driverCls, provider, identity, instance_id,
                        volume_id, *args, **kwargs):
    try:
        celery_logger.debug("unmount task started at %s." % timezone.now())
        driver = get_driver(driverCls, provider, identity)
        username = identity.get_username()
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        device_location = None
        try:
            attach_data = volume.extra['attachments'][0]
            device_location = attach_data['device']
        except (KeyError, IndexError):
            celery_logger.warn("Volume %s missing attachments in Extra"
                               % (volume, ))
        if not device_location:
            raise Exception(
                "No device_location found or inferred by volume %s" % volume)
        try:
            playbooks = deploy_unmount_volume(
                instance.ip, username, instance.id, device_location)
        except DeviceBusyException:
            # Future-Fixme: Update VolumeStatusHistory.extra,
            # set status to 'unmount_failed'
            raise
        hostname = build_host_name(instance.id, instance.ip)
        result = not (execution_has_failures(playbooks, hostname)
                      or execution_has_unreachable(playbooks, hostname))
        if not result:
            raise Exception("Error encountered while unmounting volume: %s"
                            % playbooks.stats.summarize(host=hostname))
        return device_location
    except Exception as exc:
        celery_logger.warn(exc)
        unmount_volume_task.retry(exc=exc)

def mount_volume_task(driverCls, provider, identity, instance_id, volume_id,
                      device_location, mount_location, device_type,
                      mount_prefix=None, *args, **kwargs):
    try:
        celery_logger.debug("mount task started at %s." % timezone.now())
        celery_logger.debug("mount_location: %s" % (mount_location, ))
        driver = get_driver(driverCls, provider, identity)
        username = identity.get_username()
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        try:
            attach_data = volume.extra['attachments'][0]
            if not device_location:
                device_location = attach_data['device']
        except (KeyError, IndexError):
            celery_logger.warn("Volume %s missing attachments in Extra"
                               % (volume, ))
        if not device_location:
            raise Exception(
                "No device_location found or inferred by volume %s" % volume)
        if not mount_prefix:
            mount_prefix = "/vol_"

        last_char = device_location[-1]    # /dev/sdb --> b
        if not mount_location:
            mount_location = mount_prefix + last_char

        playbook_results = deploy_mount_volume(
            instance.ip, username, instance.id,
            device_location, mount_location=mount_location,
            device_type=device_type)
        celery_logger.info(playbook_results)
        if execution_has_failures(playbook_results) \
                or execution_has_unreachable(playbook_results):
            raise Exception(
                "Error encountered while mounting volume: "
                "instance_id: {}, volume_id: {}".format(instance_id, volume_id))
        return mount_location
    except Exception as exc:
        celery_logger.warn(exc)
        mount_volume_task.retry(exc=exc)

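# --- Illustrative sketch (not part of the original task set): how the
# default mount location above is derived from the attached device path.
def _example_default_mount_location(device_location, mount_prefix="/vol_"):
    last_char = device_location[-1]    # "/dev/sdb" -> "b"
    return mount_prefix + last_char    # -> "/vol_b"
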
def update_image_membership(account_driver, cloud_machine, db_machine):
    """
    Given a cloud_machine and db_machine, create any relationships possible
    for ProviderMachineMembership and ApplicationVersionMembership.
    Return all groups that have been given share access.
    """
    image_visibility = cloud_machine.get('visibility', 'private')
    if image_visibility.lower() == 'public':
        return
    shared_project_names = _get_all_access_list(
        account_driver, db_machine, cloud_machine)
    # Future-FIXME: This logic expects project_name == Group.name.
    # When this changes, logic should update to include checks for:
    # - Lookup Identities with this project_name
    # - Share with group that has IdentityMembership
    # - Alternatively, consider changing ProviderMachineMembership
    #   to point to Identity for a 1-to-1 mapping.
    groups = Group.objects.filter(name__in=shared_project_names)
    # THIS IS A HACK - some images have been 'compromised'. In this event,
    # reset the access list _back_ to the last-known-good configuration,
    # based on a machine request.
    has_machine_request = MachineRequest.objects.filter(
        new_machine__instance_source__identifier=cloud_machine.id,
        status__name='completed').last()
    parent_app = db_machine.application_version.application
    if len(shared_project_names) > 128:
        celery_logger.warn(
            "Application %s has too many shared users. "
            "Consider running 'prune_machines' to cleanup", parent_app)
        if not has_machine_request:
            return
        access_list = has_machine_request.get_access_list()
        shared_project_names = access_list
    # END HACK
    for group in groups:
        update_db_membership_for_group(db_machine, group)
    return groups

def distribute_image_membership(account_driver, cloud_machine, provider):
    """
    Based on what we know about the DB, at a minimum, ensure that the
    projects of every group with DB-level membership are added to the
    image_members list for this cloud_machine.
    """
    pm = ProviderMachine.objects.get(
        instance_source__provider=provider,
        instance_source__identifier=cloud_machine.id)
    group_ids = ProviderMachineMembership.objects.filter(
        provider_machine=pm).values_list('group', flat=True)
    groups = Group.objects.filter(id__in=group_ids)
    for group in groups:
        try:
            celery_logger.info(
                "Add %s to cloud membership for %s" % (group, pm))
            update_cloud_membership_for_machine(pm, group)
        except TimeoutError:
            celery_logger.warn(
                "Failed to add cloud membership for %s - Operation timed out"
                % group)
    return groups

def mount_failed(task_uuid, driverCls, provider, identity, volume_id,
                 unmount=False, **celery_task_args):
    from service import volume as volume_service
    try:
        celery_logger.debug("mount_failed task started at %s." % datetime.now())
        celery_logger.info("task_uuid=%s" % task_uuid)
        result = app.AsyncResult(task_uuid)
        with allow_join_result():
            exc = result.get(propagate=False)
        err_str = "Mount Error Traceback:%s" % (result.traceback, )
        celery_logger.error(err_str)
        driver = get_driver(driverCls, provider, identity)
        volume = driver.get_volume(volume_id)
        if unmount:
            tmp_status = 'umount_error'
        else:
            tmp_status = 'mount_error'
        celery_logger.debug("mount_failed task finished at %s." % datetime.now())
        return volume_service._update_volume_metadata(
            driver, volume, metadata={'tmp_status': tmp_status})
    except Exception as exc:
        celery_logger.warn(exc)
        mount_failed.retry(exc=exc)

def mount_failed(context, exception_msg, traceback,
                 driverCls, provider, identity, volume_id,
                 unmount=False, **celery_task_args):
    from service import volume as volume_service
    try:
        celery_logger.debug("mount_failed task started at %s." % timezone.now())
        celery_logger.info("task context=%s" % context)
        err_str = "%s\nMount Error Traceback:%s" % (exception_msg, traceback)
        celery_logger.error(err_str)
        driver = get_driver(driverCls, provider, identity)
        volume = driver.get_volume(volume_id)
        if unmount:
            tmp_status = 'umount_error'
        else:
            tmp_status = 'mount_error'
        return volume_service._update_volume_metadata(
            driver, volume, metadata={'tmp_status': tmp_status})
    except Exception as exc:
        celery_logger.warn(exc)
        mount_failed.retry(exc=exc)

def validate_new_image(image_id, machine_request_id):
    machine_request = MachineRequest.objects.get(id=machine_request_id)
    new_status, _ = StatusType.objects.get_or_create(name="validating")
    machine_request.status = new_status
    machine_request.old_status = 'validating'
    machine_request.save()
    accounts = get_account_driver(machine_request.new_machine.provider)
    accounts.clear_cache()
    from service.instance import launch_machine_instance
    admin_driver = accounts.admin_driver
    admin_ident = machine_request.new_admin_identity()
    if not admin_driver:
        celery_logger.warn(
            "Need admin_driver functionality to auto-validate instance")
        return False
    if not admin_ident:
        celery_logger.warn(
            "Need to know the AccountProvider to auto-validate instance")
        return False
    # Attempt to launch using the admin_driver
    admin_driver.identity.user = admin_ident.created_by
    machine = admin_driver.get_machine(image_id)
    small_size = admin_driver.list_sizes()[0]
    instance = launch_machine_instance(
        admin_driver, admin_ident, machine, small_size,
        'Automated Image Verification - %s' % image_id,
        username='******', using_admin=True)
    return instance.id

def _share_image(account_driver, cloud_machine, identity, members,
                 dry_run=False):
    """
    INPUT: use account_driver to share cloud_machine with identity
    (if not in 'members' list)
    """
    # Skip tenant-names that are NOT in the DB,
    # and tenants that are already included.
    missing_tenant = identity.credential_set.filter(
        ~Q(value__in=members), key='ex_tenant_name')
    if missing_tenant.count() == 0:
        #celery_logger.debug("SKIPPED _ Image %s already shared with %s" % (cloud_machine.id, identity))
        return
    elif missing_tenant.count() > 1:
        raise Exception("Safety Check -- You should not be here")
    tenant_name = missing_tenant[0]
    cloud_machine_is_public = cloud_machine.is_public \
        if hasattr(cloud_machine, 'is_public') \
        else cloud_machine.get('visibility', '') == 'public'
    if cloud_machine_is_public:
        celery_logger.info("Making Machine %s private" % cloud_machine.id)
        if not dry_run:
            account_driver.image_manager.glance.images.update(
                cloud_machine.id, visibility='shared')
    celery_logger.info(
        "Sharing image %s<%s>: %s with %s"
        % (cloud_machine.id, cloud_machine.name,
           identity.provider.location, tenant_name.value))
    if not dry_run:
        try:
            account_driver.image_manager.share_image(
                cloud_machine, tenant_name.value)
        except GlanceConflict as exc:
            if 'already associated with image' in exc.message:
                pass
        except GlanceForbidden as exc:
            if 'Public images do not have members' in exc.message:
                celery_logger.warn(
                    "CONFLICT -- This image should have been marked 'shared'! %s"
                    % cloud_machine)
                pass
    return

def _share_image(account_driver, cloud_machine, identity, members,
                 dry_run=False):
    """
    INPUT: use account_driver to share cloud_machine with identity
    (if not in 'members' list)
    """
    # Skip tenant-names that are NOT in the DB,
    # and tenants that are already included.
    missing_tenant = identity.credential_set.filter(
        ~Q(value__in=members), key='ex_tenant_name')
    if missing_tenant.count() == 0:
        #celery_logger.debug("SKIPPED _ Image %s already shared with %s" % (cloud_machine.id, identity))
        return
    elif missing_tenant.count() > 1:
        raise Exception("Safety Check -- You should not be here")
    tenant_name = missing_tenant[0]
    cloud_machine_is_public = cloud_machine.is_public \
        if hasattr(cloud_machine, 'is_public') \
        else cloud_machine.get('visibility', '') == 'public'
    if cloud_machine_is_public:
        celery_logger.info("Making Machine %s private" % cloud_machine.id)
        account_driver.image_manager.glance.images.update(
            cloud_machine.id, visibility='private')
    celery_logger.info(
        "Sharing image %s<%s>: %s with %s"
        % (cloud_machine.id, cloud_machine.name,
           identity.provider.location, tenant_name.value))
    if not dry_run:
        try:
            account_driver.image_manager.share_image(
                cloud_machine, tenant_name.value)
        except GlanceConflict as exc:
            if 'already associated with image' in exc.message:
                pass
        except GlanceForbidden as exc:
            if 'Public images do not have members' in exc.message:
                celery_logger.warn(
                    "CONFLICT -- This image should have been marked 'private'! %s"
                    % cloud_machine)
                pass
    return

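# --- Illustrative sketch (not part of the original task set): the
# visibility check used in _share_image above. It handles both
# attribute-style image objects (`.is_public`) and dict-style images that
# carry a 'visibility' field instead.
def _example_is_public(cloud_machine):
    if hasattr(cloud_machine, 'is_public'):
        return cloud_machine.is_public
    return cloud_machine.get('visibility', '') == 'public'
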
def attach_task(driverCls, provider, identity, instance_id, volume_id,
                device_choice=None, *args, **kwargs):
    try:
        celery_logger.debug("attach_task started at %s." % datetime.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)

        # Step 1. Attach the volume
        # NOTE: device_choice is not guaranteed to match the actual device
        driver.attach_volume(instance, volume, device_choice)

        # When the result returns, the volume will be 'attaching'.
        # We can't do anything until the volume is 'available/in-use'.
        attempts = 0
        while True:
            volume = driver.get_volume(volume_id)
            # Give up if you can't find the volume
            if not volume:
                return None
            if attempts > 6:    # After 6 attempts (~1min)
                break
            # Openstack/Eucalyptus check
            if isinstance(driver, (OSDriver, EucaDriver)) and \
                    'attaching' not in volume.extra.get('status', ''):
                break
            # Exponential backoff..
            attempts += 1
            sleep_time = 2 ** attempts
            celery_logger.debug(
                "Volume %s is not ready (%s). Sleep for %s"
                % (volume.id, volume.extra.get('status', 'no-status'),
                   sleep_time))
            time.sleep(sleep_time)

        if 'available' in volume.extra.get('status', ''):
            raise Exception("Volume %s failed to attach to instance %s"
                            % (volume.id, instance.id))

        # Device path for euca == openstack
        try:
            attach_data = volume.extra['attachments'][0]
            device = attach_data['device']
        except (IndexError, KeyError):
            celery_logger.warn("Could not find 'device' in "
                               "volume.extra['attachments']: "
                               "Volume:%s Extra:%s" % (volume.id, volume.extra))
            device = None
        celery_logger.debug("attach_task finished at %s." % datetime.now())
        return device
    except Exception as exc:
        celery_logger.warn(exc)
        attach_task.retry(exc=exc)

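# --- Illustrative sketch (not part of the original task set): the polling
# schedule produced by the backoff loop above. `attempts` is incremented
# before sleeping and the loop exits once it exceeds 6, so the sleeps
# between polls are 2, 4, 8, ..., 128 seconds.
def _example_backoff_schedule():
    return [2 ** attempt for attempt in range(1, 8)]
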
def monitor_volumes_for(provider_id, print_logs=False):
    """
    Run the set of tasks related to monitoring volumes for a provider.
    While debugging, print_logs=True can be very helpful.
    """
    from service.driver import get_account_driver
    from core.models import Identity
    if print_logs:
        console_handler = _init_stdout_logging()

    provider = Provider.objects.get(id=provider_id)
    account_driver = get_account_driver(provider)
    # Non-End dated volumes on this provider
    db_volumes = Volume.objects.filter(
        only_current_source(), instance_source__provider=provider)
    all_volumes = account_driver.admin_driver.list_all_volumes(timeout=30)
    seen_volumes = []
    for cloud_volume in all_volumes:
        try:
            core_volume = convert_esh_volume(
                cloud_volume, provider_uuid=provider.uuid)
            seen_volumes.append(core_volume)
        except ObjectDoesNotExist:
            tenant_id = cloud_volume.extra['object']['os-vol-tenant-attr:tenant_id']
            tenant = account_driver.get_project_by_id(tenant_id)
            tenant_name = tenant.name if tenant else tenant_id
            try:
                if not tenant:
                    celery_logger.warn(
                        "Warning: tenant_id %s found on volume %s, but did "
                        "not exist from the account driver perspective.",
                        tenant_id, cloud_volume)
                    raise ObjectDoesNotExist()
                identity = Identity.objects.filter(
                    contains_credential('ex_project_name', tenant_name),
                    provider=provider
                ).first()
                if not identity:
                    raise ObjectDoesNotExist()
                core_volume = convert_esh_volume(
                    cloud_volume, provider.uuid,
                    identity.uuid, identity.created_by)
            except ObjectDoesNotExist:
                celery_logger.info(
                    "Skipping Volume %s - No Identity for: "
                    "Provider:%s + Project Name:%s"
                    % (cloud_volume.id, provider, tenant_name))
    now_time = timezone.now()
    needs_end_date = [volume for volume in db_volumes
                      if volume not in seen_volumes]
    for volume in needs_end_date:
        celery_logger.debug("End dating inactive volume: %s" % volume)
        volume.end_date = now_time
        volume.save()
    if print_logs:
        _exit_stdout_logging(console_handler)
    for vol in seen_volumes:
        vol.esh = None
    return [vol.instance_source.identifier for vol in seen_volumes]

def validate_new_image(image_id, machine_request_id):
    machine_request = MachineRequest.objects.get(id=machine_request_id)
    new_status, _ = StatusType.objects.get_or_create(name="validating")
    machine_request.status = new_status
    machine_request.old_status = 'validating'
    local_username = machine_request.created_by.username
    # NOTE: Change local_username accordingly when this assumption
    # is no longer true.
    machine_request.save()
    accounts = get_account_driver(machine_request.new_machine.provider)
    accounts.clear_cache()
    from service.instance import launch_machine_instance
    admin_driver = accounts.admin_driver
    admin_ident = machine_request.new_admin_identity()
    if not admin_driver:
        celery_logger.warn(
            "Need admin_driver functionality to auto-validate instance")
        return False
    if not admin_ident:
        celery_logger.warn(
            "Need to know the AccountProvider to auto-validate instance")
        return False
    # Attempt to launch using the admin_driver
    user = admin_ident.created_by
    admin_driver.identity.user = user
    machine = admin_driver.get_machine(image_id)
    sorted_sizes = admin_driver.list_sizes()
    size_index = 0
    while size_index < len(sorted_sizes):
        selected_size = sorted_sizes[size_index]
        size_index += 1
        try:
            instance = launch_machine_instance(
                admin_driver, user, admin_ident, machine, selected_size,
                'Automated Image Verification - %s' % image_id,
                username=local_username, using_admin=True)
            return instance.provider_alias
        except BaseHTTPError as http_error:
            if "Flavor's disk is too small for requested image" in http_error.message:
                continue
            logger.exception(http_error)
            raise
        except Exception as exc:
            logger.exception(exc)
            raise
    # End of while loop
    raise Exception("Validation of new Image %s has *FAILED*" % image_id)

def monitor_machines_for(provider_id, print_logs=False, dry_run=False):
    """
    Run the set of tasks related to monitoring machines for a provider.
    While debugging, print_logs=True can be very helpful.

    NEW LOGIC:
    * Membership and Privacy is dictated at the APPLICATION level.
    * Loop over all machines on the cloud:
      * If machine is PUBLIC, ensure the APP is public.
      * If machine is PRIVATE, ensure the APP is private && sync the membership!
    * Ignore the possibility of conflicts; the prior schema should be
      sufficient for ensuring the above two use cases.
    """
    provider = Provider.objects.get(id=provider_id)
    if print_logs:
        console_handler = _init_stdout_logging()

    # STEP 1: get the apps
    new_public_apps, private_apps = get_public_and_private_apps(provider)

    # STEP 2: Find conflicts and report them.
    intersection = set(private_apps.keys()) & set(new_public_apps)
    if intersection:
        celery_logger.error(
            "These applications were listed as BOTH public && private "
            "apps. Manual conflict correction required: %s" % intersection)

    # STEP 3: Apply the changes at app-level.
    # Memoization at this high of a level will help save time.
    account_drivers = {}    # Provider -> accountDriver
    provider_tenant_mapping = {}    # Provider -> [{TenantId: TenantName}, ...]
    image_maps = {}

    if settings.ENFORCING:
        for app in new_public_apps:
            if app in intersection:
                celery_logger.error(
                    "Skipped public app: %s <%s>" % (app, app.id))
                continue
            make_machines_public(app, account_drivers, dry_run=dry_run)
        for app, membership in private_apps.items():
            if app in intersection:
                celery_logger.error(
                    "Skipped private app: %s <%s>" % (app, app.id))
                continue
            make_machines_private(
                app, membership, account_drivers,
                provider_tenant_mapping, image_maps, dry_run=dry_run)
    else:    # settings.ENFORCING = False
        celery_logger.warn(
            "Settings.ENFORCING is set to False -- So we assume this is a "
            "development build and *NO* changes should be made to glance "
            "as a result of an 'information mismatch'")

    if print_logs:
        _exit_stdout_logging(console_handler)
    return

def remove_empty_networks_for(provider_id):
    provider = Provider.objects.get(id=provider_id)
    os_driver = get_account_driver(provider)
    if not os_driver:
        celery_logger.warn(
            "Cannot remove_empty_networks_for provider %s "
            "-- Account Driver not created" % provider)
        return
    all_instances = os_driver.admin_driver.list_all_instances()
    project_map = os_driver.network_manager.project_network_map()
    known_project_names = Credential.objects.filter(
        key='ex_project_name').values_list('value', flat=True)
    projects_with_networks = sorted(
        [k for k in project_map.keys() if k in known_project_names])
    for project in projects_with_networks:
        networks = project_map[project]['network']
        if not isinstance(networks, list):
            networks = [networks]
        for network in networks:
            network_name = network['name']
            celery_logger.debug("Checking if network %s is in use" % network_name)
            if running_instances(network_name, all_instances):
                continue
            # NOTE: Assumes the username matches the project name.
            user = project
            identity = Identity.objects.filter(
                provider_id=provider_id,
                credential__key='ex_project_name',
                credential__value=project
            ).filter(
                credential__key='key',
                credential__value=user
            ).first()
            if not identity:
                celery_logger.warn(
                    "NOT Removing project network for User:%s, Project:%s "
                    "-- No Valid Identity found!" % (user, project))
                continue
            try:
                celery_logger.debug(
                    "Removing project network for User:%s, Project:%s"
                    % (user, project))
                os_driver.delete_user_network(identity)
            except NeutronClientException:
                celery_logger.exception("Neutron unable to remove project "
                                        "network for %s-%s" % (user, project))
            except NeutronException:
                celery_logger.exception("Neutron unable to remove project "
                                        "network for %s-%s" % (user, project))

def check_volume_task(driverCls, provider, identity, instance_id, volume_id,
                      *args, **kwargs):
    try:
        celery_logger.debug("check_volume task started at %s." % datetime.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        attach_data = volume.extra['attachments'][0]
        device = attach_data['device']

        private_key = ATMOSPHERE_PRIVATE_KEYFILE
        kwargs.update({'ssh_key': private_key})
        kwargs.update({'timeout': 120})

        # One script to make two checks:
        # 1. Volume exists
        # 2. Volume has a filesystem
        cv_script = check_volume(device)
        # NOTE: non_zero_deploy needed to stop LibcloudDeploymentError
        # from being raised
        kwargs.update({'deploy': cv_script, 'non_zero_deploy': True})
        driver.deploy_to(instance, **kwargs)
        kwargs.pop('non_zero_deploy', None)
        # Script executed; inspect the exit status
        if cv_script.exit_status != 0:
            if 'No such file' in cv_script.stdout:
                raise Exception('Volume check failed: %s. '
                                'Device %s does not exist on instance %s'
                                % (volume, device, instance))
            elif 'Bad magic number' in cv_script.stdout:
                # Filesystem needs to be created for this device
                celery_logger.info("Mkfs needed")
                mkfs_script = mkfs_volume(device)
                kwargs.update({'deploy': mkfs_script})
                driver.deploy_to(instance, **kwargs)
            else:
                raise Exception('Volume check failed: Something weird')
        celery_logger.debug("check_volume task finished at %s." % datetime.now())
    except LibcloudDeploymentError as exc:
        celery_logger.exception(exc)
    except Exception as exc:
        celery_logger.warn(exc)
        check_volume_task.retry(exc=exc)

def check_volume_task(driverCls, provider, identity, instance_id, volume_id,
                      device_type='ext4', *args, **kwargs):
    try:
        celery_logger.debug("check_volume task started at %s." % timezone.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        username = identity.get_username()
        attach_data = volume.extra['attachments'][0]
        device_location = attach_data['device']
        celery_logger.info("device_location: %s" % device_location)

        # One playbook to make two checks:
        # 1. Volume exists
        # 2. Volume has a filesystem
        #    (If not, create one of type 'device_type')
        playbook_results = deploy_check_volume(
            instance.ip, username, instance.id,
            device_location, device_type=device_type)
        success = not (execution_has_failures(playbook_results)
                       or execution_has_unreachable(playbook_results))
        if not success:
            raise Exception(
                "Error encountered while checking volume for filesystem: "
                "instance_id: {}, volume_id: {}".format(instance_id, volume_id))
        return success
    except Exception as exc:
        celery_logger.warn(exc)
        check_volume_task.retry(exc=exc)

def set_machine_request_metadata(machine_request, image_id):
    admin_driver = get_admin_driver(machine_request.new_machine_provider)
    machine = admin_driver.get_machine(image_id)
    lc_driver = admin_driver._connection
    if not machine:
        celery_logger.warn("Could not find machine with ID=%s" % image_id)
        return
    if not hasattr(lc_driver, 'ex_set_image_metadata'):
        return
    metadata = lc_driver.ex_get_image_metadata(machine)
    if machine_request.new_application_description:
        metadata['description'] = machine_request.new_application_description
    if machine_request.new_version_tags:
        metadata['tags'] = machine_request.new_version_tags
    celery_logger.info("LC Driver:%s - Machine:%s - Metadata:%s"
                       % (lc_driver, machine.id, metadata))
    lc_driver.ex_set_image_metadata(machine, metadata)
    return machine

def check_volume_task(driverCls, provider, identity, instance_id, volume_id,
                      device_type='ext4', *args, **kwargs):
    try:
        celery_logger.debug("check_volume task started at %s." % datetime.now())
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        username = identity.get_username()
        attach_data = volume.extra['attachments'][0]
        device_location = attach_data['device']
        celery_logger.info("device_location: %s" % device_location)

        # One playbook to make two checks:
        # 1. Volume exists
        # 2. Volume has a filesystem
        #    (If not, create one of type 'device_type')
        playbooks = deploy_check_volume(
            instance.ip, username, instance.id,
            device_location, device_type=device_type)
        celery_logger.info(playbooks.__dict__)
        hostname = build_host_name(instance.id, instance.ip)
        result = not (execution_has_failures(playbooks, hostname)
                      or execution_has_unreachable(playbooks, hostname))
        if not result:
            raise Exception(
                "Error encountered while checking volume for filesystem: %s"
                % playbooks.stats.summarize(host=hostname))
        return result
    except LibcloudDeploymentError as exc:
        celery_logger.exception(exc)
    except Exception as exc:
        celery_logger.warn(exc)
        check_volume_task.retry(exc=exc)

def validate_new_image(image_id, machine_request_id):
    machine_request = MachineRequest.objects.get(id=machine_request_id)
    new_status, _ = StatusType.objects.get_or_create(name="validating")
    machine_request.status = new_status
    machine_request.old_status = 'validating'
    machine_request.save()
    accounts = get_account_driver(machine_request.new_machine.provider)
    accounts.clear_cache()
    from service.instance import launch_machine_instance
    admin_driver = accounts.admin_driver
    admin_ident = machine_request.new_admin_identity()
    if not admin_driver:
        celery_logger.warn(
            "Need admin_driver functionality to auto-validate instance")
        return False
    if not admin_ident:
        celery_logger.warn(
            "Need to know the AccountProvider to auto-validate instance")
        return False
    # Attempt to launch using the admin_driver
    admin_driver.identity.user = admin_ident.created_by
    machine = admin_driver.get_machine(image_id)
    sorted_sizes = admin_driver.list_sizes()
    size_index = 0
    while size_index < len(sorted_sizes):
        selected_size = sorted_sizes[size_index]
        size_index += 1
        try:
            instance = launch_machine_instance(
                admin_driver, admin_ident, machine, selected_size,
                'Automated Image Verification - %s' % image_id,
                username='******', using_admin=True)
            return instance.id
        except Exception as exc:
            # FIXME: Determine if this exception is based on 'size too
            # small'; if so, the loop should `continue` to the next size.
            logger.exception(exc)
            raise
    # End of while loop
    raise Exception("Validation of new Image %s has *FAILED*" % image_id)

def unmount_volume_task(driverCls, provider, identity, instance_id,
                        volume_id, *args, **kwargs):
    try:
        celery_logger.debug("unmount task started at %s." % timezone.now())
        driver = get_driver(driverCls, provider, identity)
        username = identity.get_username()
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        device_location = None
        try:
            attach_data = volume.extra['attachments'][0]
            device_location = attach_data['device']
        except (KeyError, IndexError):
            celery_logger.warn("Volume %s missing attachments in Extra"
                               % (volume, ))
        if not device_location:
            raise Exception(
                "No device_location found or inferred by volume %s" % volume)
        try:
            playbook_results = deploy_unmount_volume(
                instance.ip, username, instance.id, device_location)
        except DeviceBusyException:
            # Future-Fixme: Update VolumeStatusHistory.extra,
            # set status to 'unmount_failed'
            raise
        if execution_has_failures(playbook_results) \
                or execution_has_unreachable(playbook_results):
            raise Exception(
                "Error encountered while unmounting volume: "
                "instance_id: {}, volume_id: {}".format(instance_id, volume_id))
        return device_location
    except Exception as exc:
        celery_logger.warn(exc)
        unmount_volume_task.retry(exc=exc)

def validate_new_image(image_id, machine_request_id):
    if not getattr(settings, 'ENABLE_IMAGE_VALIDATION', True):
        celery_logger.warn("Skip validation: ENABLE_IMAGE_VALIDATION is False")
        return True
    machine_request = MachineRequest.objects.get(id=machine_request_id)
    new_status, _ = StatusType.objects.get_or_create(name="validating")
    machine_request.status = new_status
    machine_request.old_status = 'validating'
    local_username = machine_request.created_by.username
    # NOTE: Change local_username accordingly when this assumption
    # is no longer true.
    machine_request.save()
    accounts = get_account_driver(machine_request.new_machine.provider)
    accounts.clear_cache()
    from service.instance import launch_machine_instance
    admin_driver = accounts.admin_driver
    admin_ident = machine_request.new_admin_identity()
    if not admin_driver:
        celery_logger.warn(
            "Need admin_driver functionality to auto-validate instance")
        return False
    if not admin_ident:
        celery_logger.warn(
            "Need to know the AccountProvider to auto-validate instance")
        return False
    # Attempt to launch using the admin_driver
    user = admin_ident.created_by
    admin_driver.identity.user = user
    machine = admin_driver.get_machine(image_id)
    sorted_sizes = admin_driver.list_sizes()
    size_index = 0
    while size_index < len(sorted_sizes):
        selected_size = sorted_sizes[size_index]
        size_index += 1
        try:
            instance = launch_machine_instance(
                admin_driver, user, admin_ident, machine, selected_size,
                'Automated Image Verification - %s' % image_id,
                username=local_username, using_admin=True)
            return instance.provider_alias
        except BaseHTTPError as http_error:
            if "Flavor's disk is too small for requested image" in http_error.message:
                continue
            logger.exception(http_error)
            raise
        except Exception as exc:
            logger.exception(exc)
            raise
    # End of while loop
    raise Exception("Validation of new Image %s has *FAILED*" % image_id)

def mount_task(driverCls, provider, identity, instance_id, volume_id,
               device=None, mount_location=None, *args, **kwargs):
    try:
        celery_logger.debug("mount task started at %s." % datetime.now())
        celery_logger.debug("mount_location: %s" % (mount_location, ))
        driver = get_driver(driverCls, provider, identity)
        instance = driver.get_instance(instance_id)
        volume = driver.get_volume(volume_id)
        username = identity.get_username()
        # DEV NOTE: Set as 'users' because this is a GUARANTEED group
        # and we know our 'user' will exist (if atmo_init_full was executed)
        # in case the VM does NOT rely on iPlant LDAP
        groupname = "users"
        celery_logger.debug(volume)
        try:
            attach_data = volume.extra['attachments'][0]
            if not device:
                device = attach_data['device']
        except (KeyError, IndexError):
            celery_logger.warn("Volume %s missing attachments in Extra"
                               % (volume, ))
            device = None
        if not device:
            celery_logger.warn("Device never attached. Nothing to mount")
            return None

        private_key = "/opt/dev/atmosphere/extras/ssh/id_rsa"
        kwargs.update({'ssh_key': private_key})
        kwargs.update({'timeout': 120})

        # Step 2. Check the volume is not already mounted
        cm_script = check_mount()
        kwargs.update({'deploy': cm_script})
        driver.deploy_to(instance, **kwargs)
        if device in cm_script.stdout:
            mount_location = _parse_mount_location(cm_script.stdout, device)
            if not mount_location:
                raise Exception("Device already mounted, "
                                "but mount location could not be determined! "
                                "Check _parse_mount_location()!")
            celery_logger.warn(
                "Device already mounted. Mount output:%s" % cm_script.stdout)
            # Device has already been mounted. Move along..
            return mount_location

        # Step 3. Find a suitable location to mount the volume
        celery_logger.info("Original mount location - %s" % mount_location)
        if not mount_location:
            inc = 1
            while '/vol%s' % inc in cm_script.stdout:
                inc += 1
            mount_location = '/vol%s' % inc
        celery_logger.info("Device location - %s" % device)
        celery_logger.info("New mount location - %s" % mount_location)

        mv_script = mount_volume(device, mount_location, username, groupname)
        kwargs.update({'deploy': mv_script})
        driver.deploy_to(instance, **kwargs)
        celery_logger.debug("mount task finished at %s." % datetime.now())
        return mount_location
    except Exception as exc:
        celery_logger.warn(exc)
        mount_task.retry(exc=exc)

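# --- Illustrative sketch (not part of the original task set): picking the
# next free /volN mount point, as in Step 3 above. `mount_output` stands in
# for the raw stdout of the check_mount script.
def _example_next_mount_location(mount_output):
    inc = 1
    while '/vol%s' % inc in mount_output:
        inc += 1
    return '/vol%s' % inc
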