def delete_infrastructure(infra_id):
    """
    Delete the specified infrastructure.

    With no 'type' query parameter the deletion is asynchronous: the
    infrastructure's status in the database is set to 'deletion-requested'
    and a worker performs the actual deletion later. With type=im (and an
    optional 'cloud' parameter) the infrastructure is destroyed directly
    via the Infrastructure Manager.

    Returns a (JSON body, HTTP status) tuple for Flask.
    """
    logger = custom_logger.CustomAdapter(logging.getLogger(__name__),
                                         {'id': infra_id})

    if 'type' not in request.args:
        db = database.get_db()
        if db.connect():
            # Get current status of infrastructure
            (_, status, _, _, _) = db.deployment_get_im_infra_id(infra_id)

            # If deletion has already happened or been requested, don't do
            # anything but return success
            if status == 'deleted':
                db.close()
                logger.info('Infrastructure has already been deleted')
                return jsonify({}), 200
            elif status == 'deletion-requested':
                db.close()
                logger.info('Infrastructure deletion has already been requested')
                return jsonify({}), 200

            success = db.deployment_update_status_with_retries(infra_id,
                                                               'deletion-requested')
            if success:
                db.close()
                logger.info('Infrastructure deletion request successfully initiated')
                return jsonify({}), 200

            # Fix: close the DB connection on the failure path too; it was
            # previously leaked when the status update failed
            db.close()

        logger.critical('Infrastructure deletion request failed, possibly a database issue')
        return jsonify({}), 400

    elif request.args.get('type') == 'im':
        cloud = request.args.get('cloud')

        # Fix: token (and the clouds list) were previously unbound when the
        # DB connection failed, causing a NameError below; default token to
        # None and build the clouds list unconditionally
        token = None
        clouds_info_list = utilities.create_clouds_list(
            CONFIG.get('clouds', 'path'))

        db = database.get_db()
        if db.connect():
            token = tokens.get_token(cloud, None, db, clouds_info_list)
            db.close()

        im_auth = utilities.create_im_auth(cloud, token, clouds_info_list)
        client = imclient.IMClient(url=CONFIG.get('im', 'url'), data=im_auth)
        (status, msg) = client.getauth()
        if status != 0:
            logger.critical('Error reading IM auth file: %s', msg)
            return jsonify({}), 400

        # NOTE(review): the destroy return value is ignored and a 30s
        # timeout is hard-coded here, unlike other callers which use
        # CONFIG timeouts — confirm whether this is intentional
        client.destroy(infra_id, 30)
        return jsonify({}), 200

    return jsonify({}), 400
def delete_ansible_node(cloud, identity, db):
    """
    Delete the Ansible node for the specified cloud.

    Looks up the node's IM infrastructure id in the database, destroys the
    infrastructure via the Infrastructure Manager, then removes the node's
    DB record. The DB record is removed even when the destroy fails, as a
    best-effort cleanup.

    Returns True on success (including a failed destroy), False when the
    node is unknown or IM authentication fails.
    """
    # Get details about the node; only the infrastructure id is needed here
    # (the ip/username/timestamp fields returned by the DB are unused)
    (infrastructure_id, _, _, _) = db.get_ansible_node(cloud)

    if not infrastructure_id:
        logger.critical('[delete_ansible_node] Unable to get infrastructure id for Ansible node in cloud %s', cloud)
        return False

    logger.info('[delete_ansible_node] About to delete Ansible node from clouds %s with infrastructure id %s', cloud, infrastructure_id)

    # Get full list of cloud info
    clouds_info_list = utilities.create_clouds_list(
        CONFIG.get('clouds', 'path'))

    # Get a token if necessary
    token = tokens.get_token(cloud, identity, db, clouds_info_list)

    # Destroy infrastructure
    im_auth = utilities.create_im_auth(cloud, token, clouds_info_list)
    client = imclient.IMClient(url=CONFIG.get('im', 'url'), data=im_auth)
    (status, msg) = client.getauth()
    if status != 0:
        logger.critical('Error reading IM auth file: %s', msg)
        return False

    (return_code, msg) = client.destroy(infrastructure_id,
                                        int(CONFIG.get('timeouts', 'deletion')))
    if return_code != 0:
        # Deliberately fall through: the DB record is deleted regardless
        logger.critical('Unable to destroy Ansible node infrastructure with id "%s" on cloud "%s" due to "%s"',
                        infrastructure_id, cloud, msg)

    # Delete from the DB
    db.delete_ansible_node(cloud)

    return True
def get_infrastructures():
    """
    Get the list of infrastructures in the specified state or of the
    specified type.

    ?status=<state>[&cloud=<cloud>] lists infrastructures from the database
    in the given state; ?type=im&cloud=<cloud> lists infrastructure ids
    known to the Infrastructure Manager for that cloud.

    Returns a (JSON body, HTTP status) tuple for Flask.
    """
    if 'status' in request.args and 'type' not in request.args:
        cloud = None
        if 'cloud' in request.args:
            cloud = request.args.get('cloud')

        db = database.get_db()
        if db.connect():
            infra = db.deployment_get_infra_in_state_cloud(
                request.args.get('status'), cloud)
            db.close()
            return jsonify(infra), 200

    elif 'type' in request.args and 'cloud' in request.args:
        if request.args.get('type') == 'im':
            cloud = request.args.get('cloud')

            # Fix: token (and the clouds list) were previously unbound when
            # the DB connection failed, causing a NameError below; default
            # token to None and build the clouds list unconditionally
            token = None
            clouds_info_list = utilities.create_clouds_list(
                CONFIG.get('clouds', 'path'))

            db = database.get_db()
            if db.connect():
                token = tokens.get_token(cloud, None, db, clouds_info_list)
                db.close()

            im_auth = utilities.create_im_auth(cloud, token, clouds_info_list)
            client = imclient.IMClient(url=CONFIG.get('im', 'url'),
                                       data=im_auth)
            (status, msg) = client.getauth()
            if status != 0:
                logger.critical('Error reading IM auth file: %s', msg)
                return jsonify({}), 400

            (status, ids) = client.list_infra_ids(10)
            if ids:
                # Each entry is a URI; the IM infrastructure id is its last
                # path segment
                im_list = [uri.split('/')[-1] for uri in ids]
                return jsonify(im_list), 200

    return jsonify({}), 400
def update_clouds_status(opa_client, db, identity, config):
    """
    Refresh the status of every cloud in the supplied configuration.

    Each cloud is probed via check_cloud(); a cloud that times out or
    reports a bad status is marked as 'down' in Open Policy Agent.
    """
    for cloud in config:
        cloud_name = cloud['name']
        logger.info('Checking cloud %s', cloud_name)

        # A token may be required in order to probe the cloud
        token = tokens.get_token(cloud_name, identity, db, config)

        try:
            cloud_ok = check_cloud(cloud_name, cloud, token)
        except timeout_decorator.timeout_decorator.TimeoutError:
            logger.info('Setting status of cloud %s to down due to timeout', cloud_name)
            opa_client.set_status(cloud_name, 'down')
            continue

        if not cloud_ok:
            logger.info('Setting status of cloud %s to down', cloud_name)
            opa_client.set_status(cloud_name, 'down')
def update_cloud_details(requirements, db, identity, opa_client, config):
    """
    Update cloud images & flavours in Open Policy Agent if necessary.

    Clouds not listed in requirements['sites'] (when present) are skipped,
    as are clouds whose details were refreshed within the configured
    'updates.vms' interval.

    Returns True on success, False when existing images/flavours cannot be
    read back from Open Policy Agent.
    """
    for cloud in config:
        name = cloud['name']

        # Check if we need to consider this cloud at all
        if 'sites' in requirements:
            if name not in requirements['sites']:
                continue

        logger.info('Checking if we need to update cloud %s details', name)

        # Check if the cloud hasn't been updated recently; clouds updated
        # within the interval are skipped entirely.
        # NOTE(review): because of the 'continue' below, requires_update is
        # always True past this point — confirm whether the later checks on
        # it are intentional
        update_time = opa_client.get_cloud_update_time(name)
        requires_update = False
        if time.time() - update_time > int(CONFIG.get('updates', 'vms')):
            logger.info('Images and flavours for cloud %s have not been updated recently', name)
            requires_update = True
        else:
            continue

        # Get a token if necessary
        logger.info('Getting a new token if necessary')
        token = tokens.get_token(name, identity, db, config)

        # Get new images & flavours; a timeout yields empty data
        logger.info('Getting list of new images and flavours')
        try:
            new_data = generate_images_and_flavours(cloud, name, token)
        except timeout_decorator.timeout_decorator.TimeoutError:
            new_data = {'images': {}, 'flavours': {}}

        # Check if need to continue with this cloud
        if not new_data['images'] and not new_data['flavours']:
            logger.info('Not continuing with considering updating details for cloud %s as there is no data', name)
            continue

        # Get old images & flavours
        try:
            images_old = opa_client.get_images(name)
        except Exception as err:
            # Fix: colon was misplaced in the original format string
            logger.critical('Unable to get images due to: %s', err)
            return False

        try:
            flavours_old = opa_client.get_flavours(name)
        except Exception as err:
            # Fix: colon was misplaced in the original format string
            logger.critical('Unable to get flavours due to: %s', err)
            return False

        # Update cloud VM images if necessary
        if (not images_old or requires_update or not compare_dicts(
                images_old, new_data['images'])) and new_data['images']:
            if not compare_dicts(images_old, new_data['images']):
                logger.info('Updating images for cloud %s', name)
                opa_client.set_images(name, new_data['images'])
            else:
                logger.info('Images for cloud %s have not changed, not updating', name)
            # Record the refresh time whether or not the data changed
            opa_client.set_update_time(name)

        # Update cloud VM flavours if necessary
        if (not flavours_old or requires_update or not compare_dicts(
                flavours_old, new_data['flavours'])) and new_data['flavours']:
            if not compare_dicts(flavours_old, new_data['flavours']):
                logger.info('Updating flavours for cloud %s', name)
                opa_client.set_flavours(name, new_data['flavours'])
            else:
                logger.info('Flavours for cloud %s have not changed, not updating', name)
            # Record the refresh time whether or not the data changed
            opa_client.set_update_time(name)

    return True
def deploy(radl, cloud, time_begin, unique_id, identity, db, num_nodes=1):
    """
    Deploy infrastructure from a specified RADL file on the given cloud.

    Creates the infrastructure via the Infrastructure Manager and polls it
    until it is configured, handling retries, multi-node deployments (where
    configuration is applied in a second step), replacement of failed VMs,
    reconfiguration of unconfigured infrastructure, and various timeouts.

    Returns the IM infrastructure id on success, or None when deployment
    fails, is aborted because deletion was requested, or times out.
    """
    # Get full list of cloud info
    clouds_info_list = utilities.create_clouds_list(
        CONFIG.get('clouds', 'path'))

    # Check & get auth token if necessary
    token = tokens.get_token(cloud, identity, db, clouds_info_list)

    # Setup Open Policy Agent client
    opa_client = opaclient.OPAClient(url=CONFIG.get('opa', 'url'),
                                     timeout=int(CONFIG.get('opa', 'timeout')))

    # Setup Infrastructure Manager client
    im_auth = utilities.create_im_auth(cloud, token, clouds_info_list)
    client = imclient.IMClient(url=CONFIG.get('im', 'url'), data=im_auth)
    (status, msg) = client.getauth()
    if status != 0:
        logger.critical('Error reading IM auth file: %s', msg)
        return None

    # Create RADL content for initial deployment: for multiple nodes we strip
    # out all configure/contextualize blocks and will add this back in once we
    # have successfully deployed all required VMs
    if num_nodes > 1:
        radl_base = utilities.create_basic_radl(radl)
    else:
        radl_base = radl

    # Set availability zone in RADL if necessary; a random zone is picked
    # from those advertised for the cloud
    cloud_info = opa_client.get_cloud(cloud)
    if 'availability_zones' in cloud_info:
        availability_zones = cloud_info['availability_zones']
        if availability_zones:
            random.shuffle(availability_zones)
            logger.info('Using availability zone %s', availability_zones[0])
            radl_base = utilities.set_availability_zone(
                radl_base, availability_zones[0])

    retries_per_cloud = int(CONFIG.get('deployment', 'retries'))
    retry = 0
    success = False
    time_begin_this_cloud = time.time()

    # Retry loop: each iteration is one attempt to create and configure the
    # infrastructure from scratch
    while retry < retries_per_cloud + 1 and not success:
        # Pause between attempts (not before the first one)
        if retry > 0:
            time.sleep(int(CONFIG.get('polling', 'duration')))
        logger.info('Deployment attempt %d of %d',
                    retry + 1, retries_per_cloud + 1)
        retry += 1

        # Check if we should stop: deletion may have been requested while
        # we were waiting
        (im_infra_id_new, infra_status_new, cloud_new, _,
         _) = db.deployment_get_im_infra_id(unique_id)
        if infra_status_new in ('deletion-requested', 'deleted',
                                'deletion-failed', 'deleting'):
            logger.info('Deletion requested of infrastructure, aborting deployment')
            return None

        # Create infrastructure
        (infrastructure_id, msg) = client.create(
            radl_base, int(CONFIG.get('timeouts', 'creation')))

        if infrastructure_id:
            logger.info('Created infrastructure on cloud %s with IM id %s and waiting for it to be configured',
                        cloud, infrastructure_id)
            db.deployment_update_status_with_retries(unique_id, None, cloud,
                                                     infrastructure_id)

            time_created = time.time()
            count_unconfigured = 0        # reconfigure attempts so far
            state_previous = None         # for change-of-state logging only
            fnodes_to_be_replaced = 0     # failed fnodes awaiting re-creation
            wnodes_to_be_replaced = 0     # failed wnodes awaiting re-creation
            initial_step_complete = False # multi-node: basic infra configured
            multi_node_deletions = 0      # rounds of failed-VM cleanup

            # Wait for infrastructure to enter the configured state
            while True:
                # Sleep between polls
                time.sleep(int(CONFIG.get('polling', 'duration')))

                # Check if we should stop (deletion requested meanwhile)
                (im_infra_id_new, infra_status_new, cloud_new, _,
                 _) = db.deployment_get_im_infra_id(unique_id)
                if infra_status_new in ('deletion-requested', 'deleted',
                                        'deletion-failed', 'deleting'):
                    logger.info('Deletion requested of infrastructure so aborting deployment')
                    return None

                # Don't spend too long trying to create infrastructure, give
                # up eventually (measured from the caller-supplied time_begin)
                if time.time() - time_begin > int(
                        CONFIG.get('timeouts', 'total')):
                    logger.info('Giving up, total time waiting is too long, so will destroy infrastructure with IM id %s',
                                infrastructure_id)
                    destroy.destroy(client, infrastructure_id)
                    return None

                # Get the current overall state & states of all VMs in the
                # infrastructure
                (states, msg) = client.getstates(
                    infrastructure_id, int(CONFIG.get('timeouts', 'status')))

                # If state is not known, wait
                if not states:
                    logger.info('State is not known for infrastructure with id %s on cloud %s',
                                infrastructure_id, cloud)
                    continue

                # Overall state of infrastructure and number of VMs reported
                state = None
                have_nodes = -1
                if 'state' in states:
                    if 'state' in states['state']:
                        state = states['state']['state']
                    if 'vm_states' in states['state']:
                        have_nodes = len(states['state']['vm_states'])

                # If the state or number of nodes is unknown, wait
                if not state or have_nodes == -1:
                    logger.warning('Unable to determine state and/or number of VMs from IM')
                    continue

                # Log a change in state
                if state != state_previous:
                    logger.info('Infrastructure with IM id %s is in state %s',
                                infrastructure_id, state)
                    state_previous = state

                # Handle difference situation when state is configured
                if state == 'configured':
                    logger.info('State is configured, NumNodesWanted=%d, NumNodesHave=%d, InitialStepComplete=%d',
                                num_nodes, have_nodes, initial_step_complete)

                    # The final configured state: single-node, or multi-node
                    # after the final configuration step has been applied
                    if num_nodes == 1 or (num_nodes > 1
                                          and initial_step_complete):
                        logger.info('Successfully configured infrastructure on cloud %s, took %d secs',
                                    cloud, time.time() - time_begin_this_cloud)
                        success = True
                        return infrastructure_id

                    # Configured state for initial step of multi-node
                    # infrastructure: re-apply the full RADL (minus deploy
                    # lines) to run the stripped-out configuration
                    if num_nodes > 1 and have_nodes == num_nodes and not initial_step_complete:
                        logger.info('Successfully configured basic infrastructure on cloud %s, will now apply final configuration',
                                    cloud)
                        initial_step_complete = True
                        radl_final = ''
                        for line in radl.split('\n'):
                            if line.startswith('deploy'):
                                line = ''
                            radl_final += '%s\n' % line
                        (exit_code, msg) = client.reconfigure_new(
                            infrastructure_id, radl_final,
                            int(CONFIG.get('timeouts', 'reconfigure')))

                    # Configured state but some nodes failed and were deleted:
                    # re-create the missing fnodes/wnodes counted earlier
                    if num_nodes > 1 and have_nodes < num_nodes and not initial_step_complete:
                        logger.info('Infrastructure is now in the configured state but need to re-create failed VMs')

                        if fnodes_to_be_replaced > 0:
                            logger.info('Creating %d fnodes',
                                        fnodes_to_be_replaced)
                            radl_new = ''
                            for line in radl_base.split('\n'):
                                if line.startswith('deploy wnode'):
                                    line = ''
                                if line.startswith('deploy fnode'):
                                    line = 'deploy fnode %d\n' % fnodes_to_be_replaced
                                radl_new += '%s\n' % line
                            fnodes_to_be_replaced = 0
                            (exit_code, msg) = client.add_resource(
                                infrastructure_id, radl_new, 120)

                        if wnodes_to_be_replaced > 0:
                            logger.info('Creating %d wnodes',
                                        wnodes_to_be_replaced)
                            radl_new = ''
                            for line in radl_base.split('\n'):
                                if line.startswith('deploy fnode'):
                                    line = ''
                                if line.startswith('deploy wnode'):
                                    line = 'deploy wnode %d\n' % wnodes_to_be_replaced
                                radl_new += '%s\n' % line
                            wnodes_to_be_replaced = 0
                            (exit_code, msg) = client.add_resource(
                                infrastructure_id, radl_new, 120)

                # Destroy infrastructure which is taking too long to enter
                # the configured state
                if time.time() - time_created > int(
                        CONFIG.get('timeouts', 'configured')):
                    logger.warning('Waiting too long for infrastructure to be configured, so destroying')
                    opa_client.set_status(cloud, 'configuration-too-long')
                    destroy.destroy(client, infrastructure_id)
                    break

                # Destroy infrastructure which is taking too long to enter
                # the running state (single-node case)
                if time.time() - time_created > int(
                        CONFIG.get('timeouts', 'notrunning')
                ) and state != 'running' and state != 'unconfigured' and num_nodes == 1:
                    logger.warning('Waiting too long for infrastructure to enter the running state, so destroying')
                    opa_client.set_status(cloud, 'pending-too-long')
                    destroy.destroy(client, infrastructure_id)
                    break

                # FIXME: This factor of 3 is a hack (multi-node deployments
                # are given 3x the not-running timeout)
                if time.time() - time_created > 3 * int(
                        CONFIG.get('timeouts', 'notrunning')
                ) and state != 'running' and state != 'unconfigured' and num_nodes > 1:
                    logger.warning('Waiting too long for infrastructure to enter the running state, so destroying')
                    opa_client.set_status(cloud, 'pending-too-long')
                    destroy.destroy(client, infrastructure_id)
                    break

                # Destroy infrastructure for which deployment failed
                if state == 'failed':
                    if num_nodes > 1:
                        # Multi-node: delete only the failed VMs and note how
                        # many fnodes/wnodes need to be re-created later
                        logger.info('Infrastructure creation failed for some VMs on cloud %s, so deleting these (run %d)',
                                    cloud, multi_node_deletions)
                        multi_node_deletions += 1
                        failed_vms = 0
                        for vm_id in states['state']['vm_states']:
                            if states['state']['vm_states'][vm_id] == 'failed':
                                logger.info('Deleting VM with id %d',
                                            int(vm_id))
                                failed_vms += 1

                                # Determine what type of node (fnode or wnode)
                                (exit_code, vm_info) = client.get_vm_info(
                                    infrastructure_id, int(vm_id),
                                    int(CONFIG.get('timeouts', 'deletion')))

                                # FIXME - is found_vm really needed?
                                found_vm = False
                                for info in vm_info['radl']:
                                    if 'state' in info and 'id' in info:
                                        found_vm = True
                                        if 'fnode' in info['id']:
                                            fnodes_to_be_replaced += 1
                                        else:
                                            wnodes_to_be_replaced += 1
                                if not found_vm:
                                    # NOTE(review): logger.warn is deprecated
                                    # in favour of logger.warning
                                    logger.warn('Unable to determine type of VM')

                                # Delete the VM
                                (exit_code, msg_remove) = client.remove_resource(
                                    infrastructure_id, int(vm_id),
                                    int(CONFIG.get('timeouts', 'deletion')))

                        logger.info('Deleted %d failed VMs from infrastructure',
                                    failed_vms)

                        # Check if we have deleted all VMs: in this case IM
                        # will return 'unknown' as the status so it's best to
                        # just start again
                        if failed_vms == num_nodes:
                            logger.warning('All VMs failed and deleted, so destroying infrastructure')
                            opa_client.set_status(cloud, state)
                            destroy.destroy(client, infrastructure_id)
                            break

                        continue
                    else:
                        # Single-node: nothing to salvage, destroy and retry
                        logger.warning('Infrastructure creation failed on cloud %s, so destroying',
                                       cloud)
                        opa_client.set_status(cloud, state)
                        destroy.destroy(client, infrastructure_id)
                        break

                # Handle unconfigured infrastructure: try reconfiguring a
                # limited number of times, then give up and destroy
                if state == 'unconfigured':
                    count_unconfigured += 1
                    file_unconf = '%s/contmsg-%s-%d.txt' % (CONFIG.get(
                        'logs', 'contmsg'), unique_id, time.time())
                    contmsg = client.getcontmsg(
                        infrastructure_id,
                        int(CONFIG.get('timeouts', 'deletion')))
                    if count_unconfigured < int(
                            CONFIG.get('deployment', 'reconfigures')) + 1:
                        logger.warning('Infrastructure on cloud %s is unconfigured, will try reconfiguring after writing contmsg to a file',
                                       cloud)
                        try:
                            with open(file_unconf, 'w') as unconf:
                                unconf.write(contmsg)
                        except Exception as error:
                            logger.warning('Unable to write contmsg to file')
                        client.reconfigure(
                            infrastructure_id,
                            int(CONFIG.get('timeouts', 'reconfigure')))
                    else:
                        logger.warning('Infrastructure has been unconfigured too many times, so destroying after writing contmsg to a file')
                        opa_client.set_status(cloud, state)
                        try:
                            with open(file_unconf, 'w') as unconf:
                                unconf.write(contmsg)
                        except Exception as error:
                            logger.warning('Unable to write contmsg to file')
                        # Destroy regardless of whether the contmsg could be
                        # written
                        destroy.destroy(client, infrastructure_id)
                        break
        else:
            # Creation itself failed: record why and fall through to the
            # next retry attempt
            logger.warning('Deployment failure on cloud %s with id %s with msg="%s"',
                           cloud, infrastructure_id, msg)
            if msg == 'timedout':
                logger.warning('Infrastructure creation failed due to a timeout')
                opa_client.set_status(cloud, 'creation-timeout')
            else:
                file_failed = '%s/failed-%s-%d.txt' % (CONFIG.get(
                    'logs', 'contmsg'), unique_id, time.time())
                opa_client.set_status(cloud, 'creation-failed')
                logger.warning('Infrastructure creation failed, writing stdout/err to file "%s"',
                               file_failed)
                try:
                    with open(file_failed, 'w') as failed:
                        failed.write(msg)
                except Exception as error:
                    logger.warning('Unable to write contmsg to file')

    # All attempts exhausted (or aborted) without success
    return None
def delete(unique_id):
    """
    Delete the infrastructure with the specified id.

    Looks up the IM infrastructure id, cloud and status in the database,
    determines the resource type from the clouds configuration, and destroys
    the infrastructure accordingly, updating the deployment status in the
    database.

    Returns True on success (or when there is nothing to destroy), False
    when IM authentication fails or the destroy fails.
    """
    logger.info('Deleting infrastructure')

    db = database.get_db()
    db.connect()

    (im_infra_id, infra_status, cloud, _,
     _) = db.deployment_get_im_infra_id(unique_id)
    logger.info('Obtained IM id %s and cloud %s and status %s',
                im_infra_id, cloud, infra_status)

    # Get full list of cloud info
    clouds_info_list = utilities.create_clouds_list(
        CONFIG.get('clouds', 'path'))

    # Determine resource type of the cloud in question
    resource_type = None
    for cloud_info in clouds_info_list:
        if cloud_info['name'] == cloud:
            resource_type = cloud_info['type']

    if im_infra_id and cloud:
        if resource_type == 'cloud':
            # IM infrastructure ids are UUIDs; anything else is treated as
            # already gone
            match_obj_name = re.match(
                r'\b[0-9a-f]{8}\b-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-\b[0-9a-f]{12}\b',
                im_infra_id)
            if match_obj_name:
                logger.info('Deleting cloud infrastructure with IM id %s',
                            im_infra_id)

                # Get the identity of the user who created the infrastructure
                identity = db.deployment_get_identity(unique_id)

                # Check & get auth token if necessary
                token = tokens.get_token(cloud, identity, db,
                                         clouds_info_list)

                # Setup Infrastructure Manager client
                im_auth = utilities.create_im_auth(cloud, token,
                                                   clouds_info_list)
                client = imclient.IMClient(url=CONFIG.get('im', 'url'),
                                           data=im_auth)
                (status, msg) = client.getauth()
                if status != 0:
                    logger.critical('Error reading IM auth file: %s', msg)
                    db.close()
                    return False

                destroyed = destroy(client, im_infra_id)
                if destroyed:
                    db.deployment_update_status_with_retries(
                        unique_id, 'deleted')
                    logger.info('Destroyed infrastructure with IM infrastructure id %s',
                                im_infra_id)
                else:
                    db.deployment_update_status_with_retries(
                        unique_id, 'deletion-failed')
                    logger.critical('Unable to destroy infrastructure with IM infrastructure id %s',
                                    im_infra_id)
                    # Fix: close the DB connection on this failure path too;
                    # it was previously leaked
                    db.close()
                    return False
            else:
                logger.critical('IM infrastructure id %s does not match regex',
                                im_infra_id)
                db.deployment_update_status_with_retries(unique_id, 'deleted')
        elif resource_type == 'batch':
            match_obj_name = re.match(r'[\d]+', im_infra_id)
            if match_obj_name:
                logger.info('Deleting batch infrastructure with HTCondor job id %s',
                            im_infra_id)
                #client = htcondorclient.HTCondorClient()
                #client.destroy(int(im_infra_id))
    else:
        logger.info('No need to destroy infrastructure because resource infrastructure id is %s, resource name is %s, resource type is %s',
                    im_infra_id, cloud, resource_type)
        db.deployment_update_status_with_retries(unique_id, 'deleted')

    db.close()
    return True
def set_quotas(requirements, db, identity, opa_client, config):
    """
    Determine the available remaining quotas and set them in Open Policy
    Agent.

    Only OpenStack clouds are queried (via get_quotas_openstack, using a
    Keystone-scoped token when a token is available); clouds not listed in
    requirements['sites'] (when present) are skipped, as are clouds whose
    quotas were refreshed within the configured 'updates.quotas' interval.

    Returns True on success, False when the quota update time cannot be
    read from Open Policy Agent.
    """
    for cloud in config:
        name = cloud['name']
        credentials = cloud['credentials']
        instances = None
        cores = None
        memory = None

        # Check if we need to consider this cloud at all
        if 'sites' in requirements:
            if name not in requirements['sites']:
                continue

        # Get a token if necessary
        token = tokens.get_token(name, identity, db, config)

        if credentials['type'] == 'OpenStack':
            # Get a scoped token if necessary from Keystone
            if token:
                logger.info('Getting a scoped token from Keystone')
                try:
                    token = tokens.get_scoped_token(
                        credentials['host'], credentials['project_id'],
                        tokens.get_unscoped_token(credentials['host'],
                                                  token,
                                                  credentials['username'],
                                                  credentials['tenant']))
                except timeout_decorator.timeout_decorator.TimeoutError:
                    logger.critical('Unable to get a scoped token from Keystone due to a timeout')
                    continue

            # Check if the cloud hasn't been updated recently
            logger.info('Checking if we need to update cloud %s quotas', name)
            try:
                update_time = opa_client.get_quota_update_time(name)
            except Exception as err:
                # Fix: the original message had no %s placeholder, so the
                # exception was never rendered into the log record
                logger.critical('Unable to get quota update time due to: %s', err)
                return False

            if time.time() - update_time > int(CONFIG.get('updates', 'quotas')):
                logger.info('Quotas for cloud %s have not been updated recently, so getting current values', name)
                try:
                    (instances, cores, memory) = get_quotas_openstack(
                        name, credentials, token)
                except timeout_decorator.timeout_decorator.TimeoutError:
                    (instances, cores, memory) = (None, None, None)
        elif credentials['type'] != 'InfrastructureManager':
            logger.warning('Unable to determine quotas for cloud %s of type %s',
                           name, credentials['type'])

        if instances and cores and memory:
            logger.info('Setting updated quotas for cloud %s: instances %d, cpus %d, memory %d',
                        name, instances, cores, memory)
            opa_client.set_quotas(name, instances, cores, memory)
        else:
            logger.info('Not setting updated quotas for cloud %s', name)

    return True