def create_dsl(choices):
    """Create a Reproducible Experiments Metadata (DSL) file and store it in Pithos.

    Builds a YAML description of the cluster identified by
    choices['cluster_id'] and uploads it under the user's Pithos path.
    On success returns (dsl_id, pithos_path, dsl_name); on upload failure
    the database record is marked 'Failed' and ClientError is raised.
    """
    plain_token = unmask_token(encrypt_key, choices['token'])
    user_uuid = get_user_id(plain_token)
    action_date = datetime.now().replace(microsecond=0)
    cluster = ClusterInfo.objects.get(id=choices['cluster_id'])
    # Describe the cluster: flavors are [cpu, ram, disk] triples.
    master_flavor = [cluster.cpu_master, cluster.ram_master, cluster.disk_master]
    slave_flavor = [cluster.cpu_slaves, cluster.ram_slaves, cluster.disk_slaves]
    data = {
        'cluster': {
            'name': cluster.cluster_name,
            'project_name': cluster.project_name,
            'image': cluster.os_image,
            'disk_template': u'{0}'.format(cluster.disk_template),
            'size': cluster.cluster_size,
            'flavor_master': master_flavor,
            'flavor_slaves': slave_flavor,
        },
        'configuration': {
            'replication_factor': cluster.replication_factor,
            'dfs_blocksize': cluster.dfs_blocksize,
        },
    }
    # give file proper type
    if not choices['dsl_name'].endswith(('.yml', '.yaml')):
        choices['dsl_name'] = '{0}.yaml'.format(choices['dsl_name'])
    dsl_id = db_dsl_create(choices, current_task.request.id)
    yaml_data = yaml.safe_dump(data, default_flow_style=False)
    url = '{0}/{1}/{2}/{3}'.format(pithos_url, user_uuid,
                                   choices['pithos_path'],
                                   urllib.quote(choices['dsl_name']))
    headers = {'X-Auth-Token': '{0}'.format(plain_token),
               'content-type': 'text/plain'}
    # send file to Pithos
    upload = requests.put(url, headers=headers, data=yaml_data)
    if upload.status_code == pithos_put_success:
        db_dsl_update(choices['token'], dsl_id, state='Created', dsl_data=yaml_data)
        return dsl_id, choices['pithos_path'], choices['dsl_name']
    db_dsl_update(choices['token'], dsl_id, state='Failed')
    msg = "Failed to save experiment metadata %s to %s" % (choices['dsl_name'], choices['pithos_path'])
    raise ClientError(msg, error_pithos_connection)
def check_pithos_object_exists(pithos_path, dsl_name, token):
    """HEAD the object in Pithos and return the HTTP status code.

    A successful status means the object exists under the user's
    pithos_path/dsl_name; error codes are returned as-is to the caller.
    """
    plain_token = unmask_token(encrypt_key, token)
    object_url = '{0}/{1}/{2}/{3}'.format(pithos_url,
                                          get_user_id(plain_token),
                                          pithos_path, dsl_name)
    head_response = requests.head(object_url,
                                  headers={'X-Auth-Token': '{0}'.format(plain_token)})
    return head_response.status_code
def get_pithos_container_info(pithos_path, token):
    """HEAD the Pithos container and return the HTTP status code.

    Only the first path segment of pithos_path (the container name)
    is used; any trailing object path is ignored.
    """
    # partition('/') yields the text before the first '/', or the whole
    # string when no '/' is present — same as the old conditional split.
    container = pithos_path.partition('/')[0]
    plain_token = unmask_token(encrypt_key, token)
    container_url = '{0}/{1}/{2}'.format(pithos_url,
                                         get_user_id(plain_token),
                                         container)
    head_response = requests.head(container_url,
                                  headers={'X-Auth-Token': '{0}'.format(plain_token)})
    return head_response.status_code
def import_dsl(choices):
    """Imports a Reproducible Experiments Metadata file from Pithos.

    Downloads the object choices['pithos_path']/choices['dsl_name'] from
    the user's Pithos account, creates a dsl record for it and stores the
    raw file contents in the database.

    Returns (dsl_id, pithos_path, dsl_name) on success.
    Raises HTTPError when the Pithos download fails.
    """
    uuid = get_user_id(unmask_token(encrypt_key,choices['token']))
    # quote() protects names containing spaces/special chars in the URL path
    url = '{0}/{1}/{2}/{3}'.format(pithos_url, uuid, choices['pithos_path'], urllib.quote(choices['dsl_name']))
    headers = {'X-Auth-Token':'{0}'.format(unmask_token(encrypt_key,choices['token']))}
    request = Request(url, headers=headers)
    try:
        # Read the whole object into memory before touching the database,
        # so a failed download never leaves a dangling dsl record.
        pithos_input_stream = urlopen(request).read()
        task_id = current_task.request.id
        dsl_id = db_dsl_create(choices, task_id)
        db_dsl_update(choices['token'],dsl_id,state='Created',dsl_data=pithos_input_stream)
        return dsl_id, choices['pithos_path'], choices['dsl_name']
    except HTTPError, e:
        # NOTE(review): HTTPError is re-raised with (e, error_import_dsl),
        # which does not match urllib2.HTTPError's normal constructor
        # signature — presumably callers only inspect args; confirm.
        raise HTTPError(e, error_import_dsl)
def scale_cluster(token, cluster_id, cluster_delta, status='Pending'):
    """
    Scales an active cluster by cluster_delta (signed int).
    For scaling up finds the cluster settings and last internal ip/port slave
    and "appends" cluster_delta nodes.
    For scaling down it removes the last slave.

    NOTE(review): only the scale-down (cluster_delta < 0) path is visible in
    this chunk; the scale-up path presumably follows elsewhere in the file.
    """
    # Imported lazily to avoid circular imports at module load time.
    from reroute_ssh import reroute_ssh_to_slaves
    from run_ansible_playbooks import modify_ansible_hosts_file,ansible_scale_cluster,ansible_manage_cluster
    cluster_to_scale = ClusterInfo.objects.get(id=cluster_id)
    pre_scale_size = cluster_to_scale.cluster_size
    previous_cluster_status = cluster_to_scale.cluster_status
    previous_hadoop_status = cluster_to_scale.hadoop_status
    # Maps the database's numeric status codes to their display names,
    # used to restore the pre-scale status on failure.
    status_map = {"0":"Destroyed","1":"Active","2":"Pending","3":"Failed"}
    # pre-flight checks. If cluster status is pending or hadoop status formatting abort.
    if (previous_cluster_status == const_cluster_status_pending) or (previous_hadoop_status == const_hadoop_status_format):
        current_task.update_state(state="Skipping")
        return cluster_to_scale.cluster_name
    # pre-flight checks done
    current_task.update_state(state="Started")
    auth = check_credentials(unmask_token(encrypt_key,token))
    current_task.update_state(state="Authenticated")
    endpoints, user_id = endpoints_and_user_id(auth)
    cyclades = init_cyclades(endpoints['cyclades'], unmask_token(encrypt_key,token))
    netclient = init_cyclades_netclient(endpoints['network'], unmask_token(encrypt_key,token))
    plankton = init_plankton(endpoints['plankton'], unmask_token(encrypt_key,token))
    state = ''
    list_of_new_slaves = []
    # Ansible host groups are keyed by "<cluster_name>-<cluster_id>".
    cluster_name_suffix_id = '{0}-{1}'.format(cluster_to_scale.cluster_name, cluster_id)
    if cluster_delta < 0: # scale down
        # One iteration per node to remove (cluster_delta is negative).
        for counter in range(cluster_delta,0):
            state = "Starting node decommission for %s" % (cluster_to_scale.cluster_name)
            set_cluster_state(token, cluster_id, state)
            try:
                # Pick the node to drop, then decommission it from Hadoop
                # via ansible before the VM itself is deleted.
                node_fqdn, node_id = find_node_to_remove(cluster_to_scale, cyclades, netclient)
                state = "Decommissioning Node %s from %s" % (node_fqdn,cluster_to_scale.cluster_name)
                set_cluster_state(token, cluster_id, state)
                ansible_hosts = modify_ansible_hosts_file(cluster_name_suffix_id, action='remove_slaves', slave_hostname=node_fqdn)
                ansible_scale_cluster(ansible_hosts, action='remove_slaves', slave_hostname=node_fqdn.split('.')[0])
            except Exception, e:
                # Restore the pre-scale status in the database, then abort.
                msg = str(e.args[0])
                set_cluster_state(token, cluster_id, state=msg, status=status_map[previous_cluster_status], error=msg)
                raise RuntimeError(msg)
            state = "Node %s decommissioned from %s and will be deleted"% (node_fqdn, cluster_to_scale.cluster_name)
            # Delete the VM and detach it from the cluster record.
            cluster_remove_node(node_fqdn, node_id, token, cluster_id, cluster_to_scale, cyclades, status_map[previous_cluster_status])
def ssh_key_list(self):
    """Get the user's ssh keys as a list of dictionaries.

    Queries the ~okeanos userdata API with the user's (unmasked) token and
    parses the JSON response. Returns an empty list when the response is
    not a valid JSON list (e.g. curl failure or an API error body).
    """
    import json  # local import so this fix is self-contained
    # NOTE(review): the token appears on the command line and is visible in
    # `ps`; consider issuing this request with the requests library instead
    # of shelling out to curl.
    command = 'curl -X GET -H "Content-Type: application/json" -H "Accept: application/json" -H "X-Auth-Token: ' \
        + unmask_token(encrypt_key, self.opts['token']) + '" https://cyclades.okeanos.grnet.gr/userdata/keys' # get ssh_keys from okeanos server
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    out, err = p.communicate()
    # The endpoint returns a JSON list of key dictionaries. Parse it with
    # the json module instead of the previous hand-rolled string splitting,
    # which silently produced wrong results whenever a value contained an
    # embedded quote, '", "' or ': ' sequence.
    try:
        ssh_keys = json.loads(out)
    except ValueError:
        # Non-JSON output (network error, HTML error page, empty body).
        return list()
    if not isinstance(ssh_keys, list):
        return list()
    return ssh_keys
def check_scale_cluster_up(token, cluster_id, cluster_to_scale):
    """
    Check user quota if new node can be added to existing cluster.
    Return tuple with message and value
    """
    project_id = get_project_id(unmask_token(encrypt_key,token), cluster_to_scale.project_name)
    quotas = check_quota(unmask_token(encrypt_key,token), project_id)
    # The new node will be a slave node and will have the same flavor
    # combination with the existing nodes of the cluster, so each resource
    # is compared against the slave flavor. Checked in order: ram, cpu, disk.
    requirements = (
        ('ram', cluster_to_scale.ram_slaves,
         'Not enough ram for new node.', error_quotas_ram),
        ('cpus', cluster_to_scale.cpu_slaves,
         'Not enough cpu for new node.', error_quotas_cpu),
        ('disk', cluster_to_scale.disk_slaves,
         'Not enough disk for new node.', error_quotas_cyclades_disk),
    )
    for resource, needed, msg, error_code in requirements:
        if quotas[resource]['available'] < needed:
            set_cluster_state(token, cluster_id, state=msg)
            return (msg, error_code)
    return ('SUCCESS',0)
def destroy_server(token, id):
    """Destroy a VRE server in ~okeanos and release its public IP.

    Deletes the VM referenced by the VreServer row `id`, waits for the
    deletion to complete, then removes the server's floating IP.
    Returns the server name; raises ClientError if the VM never reaches
    the DELETED state.
    """
    # NOTE(review): parameter `id` shadows the builtin; left unchanged
    # because callers may pass it by keyword.
    current_task.update_state(state="Started")
    vre_server = VreServer.objects.get(id=id)
    auth = check_credentials(unmask_token(encrypt_key,token))
    current_task.update_state(state="Authenticated")
    set_server_state(token, id, 'Deleting VRE server and its public IP')
    endpoints, user_id = endpoints_and_user_id(auth)
    cyclades = init_cyclades(endpoints['cyclades'], unmask_token(encrypt_key,token))
    nc = init_cyclades_netclient(endpoints['network'], unmask_token(encrypt_key,token))
    cyclades.delete_server(vre_server.server_id)
    # Block until the VM leaves ACTIVE; anything other than DELETED is a failure.
    new_status = cyclades.wait_server(vre_server.server_id,current_status='ACTIVE',max_wait=MAX_WAIT)
    if new_status != 'DELETED':
        state = 'Error while deleting VRE server'
        set_server_state(token, id, state,status='Destroyed')
        raise ClientError('Error while deleting VRE server', error_fatal)
    # Release the floating IP that was attached to the deleted server.
    ip_to_delete = get_public_ip_id(nc,vre_server.server_IP)
    nc.delete_floatingip(ip_to_delete['id'])
    state= 'VRE server {0} and its public IP {1} were deleted'.format(vre_server.server_name,vre_server.server_IP)
    set_server_state(token, id, state, status='Destroyed')
    return vre_server.server_name
def __init__(self, opts):
    """Initialization of YarnCluster data attributes.

    opts is the dict of user-supplied cluster options; it is stored on the
    instance and partially normalized in place (disk_template mapping).
    Raises ClientError when the chosen project has no available quota.
    """
    self.opts = opts
    # Master VM ip, placeholder value
    self.HOSTNAME_MASTER_IP = '127.0.0.1'
    # master VM root password file, placeholder value
    self.pass_file = 'PLACEHOLDER'
    # Resolved later, once the image is chosen.
    self.orka_image_uuid = False
    # List of cluster VMs
    self.server_dict = {}
    # Map user-facing disk template names to the ~okeanos flavor values.
    if self.opts['disk_template'] == 'Archipelago':
        self.opts['disk_template'] = 'ext_vlmc'
    elif self.opts['disk_template'] == 'Standard':
        self.opts['disk_template'] = 'drbd'
    # project id of project name given as argument
    self.project_id = get_project_id(unmask_token(encrypt_key, self.opts['token']),
                                     self.opts['project_name'])
    self.status = {}
    # Instance of an AstakosClient object
    self.auth = check_credentials(unmask_token(encrypt_key, self.opts['token']),
                                  self.opts.get('auth_url', auth_url))
    # Check if project has actual quota
    if self.check_project_quota() != 0:
        msg = 'Project %s exists but you have no quota to request' % \
            self.opts['project_name']
        raise ClientError(msg, error_project_quota)
    # ~okeanos endpoints and user id
    self.endpoints, self.user_id = endpoints_and_user_id(self.auth)
    # Instance of CycladesClient
    self.cyclades = init_cyclades(self.endpoints['cyclades'],
                                  unmask_token(encrypt_key, self.opts['token']))
    # Instance of CycladesNetworkClient
    self.net_client = init_cyclades_netclient(self.endpoints['network'],
                                              unmask_token(encrypt_key, self.opts['token']))
    # Instance of Plankton/ImageClient
    self.plankton = init_plankton(self.endpoints['plankton'],
                                  unmask_token(encrypt_key, self.opts['token']))
    # Get resources of pending clusters
    self.pending_quota = retrieve_pending_clusters(unmask_token(encrypt_key, self.opts['token']),
                                                   self.opts['project_name'])
    # Registry of pre-flight checker methods, keyed 1..N in insertion order.
    self._DispatchCheckers = {}
    self._DispatchCheckers[len(self._DispatchCheckers) + 1] =\
        self.check_cluster_size_quotas
    # Check for private network availability only when cluster is created
    # and not for Vre server creation
    # NOTE(review): only the network-quota check is gated on multi-node
    # clusters here — confirm whether the IP check should also be gated.
    if self.opts['cluster_size'] > 1:
        self._DispatchCheckers[len(self._DispatchCheckers) + 1] =\
            self.check_network_quotas
    self._DispatchCheckers[len(self._DispatchCheckers) + 1] =\
        self.check_ip_quotas
    self._DispatchCheckers[len(self._DispatchCheckers) + 1] =\
        self.check_cpu_valid
    self._DispatchCheckers[len(self._DispatchCheckers) + 1] =\
        self.check_ram_valid
    self._DispatchCheckers[len(self._DispatchCheckers) + 1] =\
        self.check_disk_valid
def project_list_flavor_quota(user):
    """Creates the list of resources for every project a user has quota.

    For each active project the user belongs to (and has quota in),
    collects flavors, quota, images and available ssh-key names into a
    ClusterCreationParams entry. The 'system:<id>' project is moved to
    the front of the list. Raises ClientError if the project list cannot
    be fetched.
    """
    okeanos_token = unmask_token(encrypt_key, user.okeanos_token)
    list_of_resources = list()
    flavors = get_flavor_lists(okeanos_token)
    auth = check_credentials(okeanos_token)
    ssh_info = ssh_key_list(okeanos_token)
    ssh_keys_names = list()
    dict_quotas = auth.get_quotas()
    try:
        list_of_projects = auth.get_projects(state='active')
    except ClientError:
        msg = ' Could not get list of projects'
        raise ClientError(msg, error_get_list_projects)
    # Id for ember-data, will use it for store.push the different projects
    ember_project_id = 1
    for item in ssh_info:
        # find the names of available ssh keys
        # ('in' instead of has_key(): same behavior, not py2-only)
        if 'name' in item:
            ssh_keys_names.append(item['name'])
    # Put system project in the first place of project list.
    # Iterate over a copy: removing/inserting while iterating the same
    # list can skip elements (modify-during-iteration bug).
    for project in list(list_of_projects):
        if project['name'] == 'system:'+str(project['id']):
            list_of_projects.remove(project)
            list_of_projects.insert(0,project)
    for project in list_of_projects:
        if project['id'] in dict_quotas:
            quotas = check_quota(okeanos_token, project['id'])
            images = check_images(okeanos_token, project['id'])
            list_of_resources.append(retrieve_ClusterCreationParams(flavors, quotas, images,
                                                                    project['name'], user,
                                                                    ember_project_id,
                                                                    ssh_keys_names))
            ember_project_id = ember_project_id + 1
    return list_of_resources
def ansible_create_cluster(hosts_filename, cluster_size, orka_image_uuid, ssh_file, token,
                           replication_factor, dfs_blocksize, admin_password):
    """
    Calls the ansible playbook that installs and configures hadoop and
    everything needed for hadoop to be functional. hosts_filename is the
    name of ansible_hosts file. If a specific hadoop image was used in the
    VMs creation, ansible playbook will not install Hadoop-YARN and will
    only perform the appropriate configuration.
    """
    logging.log(REPORT, ' Ansible starts YARN installation on master and slave nodes')
    level = logging.getLogger().getEffectiveLevel()
    # chosen image includes role and tags properties
    image_tags = get_image_category(image_uuid=orka_image_uuid)
    decoded_image_tags = decode_json(image_tags['ansible_cluster_config_tags'])
    # Create debug file for ansible
    debug_file_name = "create_cluster_debug_" + hosts_filename.split(ansible_hosts_prefix, 1)[1] + ".log"
    ansible_log = " >> " + os.path.join(LOGS_PATH, debug_file_name)
    # find ansible playbook (site.yml)
    uuid = UserInfo.objects.get(okeanos_token=token).uuid
    # Assemble the ansible-playbook invocation in one pass; output of the
    # run is appended to the per-cluster debug log file.
    playbook_command = ('ansible-playbook -i {0} {1} {2} -f {3} -e '
                        '"choose_role={4} ssh_file_name={5} token={6} '
                        'dfs_blocksize={7}m dfs_replication={8} uuid={9} '
                        'admin_password={10}" {11}{12}').format(
        hosts_filename, ansible_playbook, ansible_verbosity,
        str(cluster_size), decoded_image_tags['role'], ssh_file,
        unmask_token(encrypt_key, token), dfs_blocksize, replication_factor,
        uuid, admin_password, decoded_image_tags['tags'], ansible_log)
    # Execute ansible
    execute_ansible_playbook(playbook_command)
def destroy_cluster(token, cluster_id, master_IP='', status='Destroyed'):
    """
    Destroys cluster and deletes network and floating IP. Finds the machines
    that belong to the cluster from the cluster id that is given. Cluster id
    is the unique integer that each cluster has in escience database.

    Errors while tearing down individual resources are accumulated and the
    teardown continues; the database record is only removed when the cluster
    was not Active/Pending. Raises ClientError on unrecoverable failures.
    """
    cluster_to_delete = ClusterInfo.objects.get(id=cluster_id)
    cluster_name = cluster_to_delete.cluster_name
    # cluster exists on cyclades, operate on ~okeanos infrastructure for removal, update database
    current_task.update_state(state="Started")
    servers_to_delete = []
    # Prefer the IP recorded in the database; fall back to the argument.
    if cluster_to_delete.master_IP:
        float_ip_to_delete = cluster_to_delete.master_IP
    else:
        float_ip_to_delete = master_IP
    list_of_errors = []
    master_id = None
    network_to_delete_id = None
    float_ip_to_delete_id = None
    new_status = 'placeholder'
    auth = check_credentials(unmask_token(encrypt_key,token))
    current_task.update_state(state="Authenticated")
    endpoints, user_id = endpoints_and_user_id(auth)
    cyclades = init_cyclades(endpoints['cyclades'], unmask_token(encrypt_key,token))
    nc = init_cyclades_netclient(endpoints['network'], unmask_token(encrypt_key,token))
    # Get list of servers and public IPs
    try:
        list_of_servers = cyclades.list_servers(detail=True)
    except ClientError:
        msg = 'Could not get list of resources.'\
            'Cannot delete cluster'
        raise ClientError(msg, error_get_list_servers)
    # Get master virtual machine and network from IP
    ip = get_public_ip_id(nc, float_ip_to_delete)
    float_ip_to_delete_id = ip['id']
    master_id = ip['instance_id']
    master_server = cyclades.get_server_details(master_id)
    # The cluster's private network is the master's fixed IPv4 attachment.
    for attachment in master_server['attachments']:
        if (attachment['OS-EXT-IPS:type'] == 'fixed' and not attachment['ipv6']):
            network_to_delete_id = attachment['network_id']
            break
    # Show an error message and exit if not valid IP or network
    if not master_id:
        msg = '[%s] is not the valid public IP of the master' % \
            float_ip_to_delete
        raise ClientError(msg, error_get_ip)
    if not network_to_delete_id:
        # No private network found: delete only the master and bail out.
        cyclades.delete_server(master_id)
        set_cluster_state(token, cluster_id, "Deleted master VM", status=status)
        msg = 'A valid network of master and slaves was not found.'\
            'Deleting the master VM only'
        raise ClientError(msg, error_cluster_corrupt)
    # Get the servers of the cluster to be deleted
    for server in list_of_servers:
        for attachment in server['attachments']:
            if attachment['network_id'] == network_to_delete_id:
                servers_to_delete.append(server)
                break
    number_of_nodes = len(servers_to_delete)
    set_cluster_state(token, cluster_id, "Starting deletion of requested cluster")
    # Start cluster deleting
    try:
        for server in servers_to_delete:
            cyclades.delete_server(server['id'])
        state= 'Deleting %d servers ' % number_of_nodes
        set_cluster_state(token, cluster_id, state)
        # Wait for every server of the cluster to be deleted
        for server in servers_to_delete:
            new_status = cyclades.wait_server(server['id'], current_status='ACTIVE', max_wait=MAX_WAIT)
            if new_status != 'DELETED':
                logging.error('Error deleting server [%s]' % server['name'])
                list_of_errors.append(error_cluster_corrupt)
        set_cluster_state(token, cluster_id, 'Deleting cluster network and public IP')
    except ClientError:
        logging.exception('Error in deleting server')
        list_of_errors.append(error_cluster_corrupt)
    try:
        nc.delete_network(network_to_delete_id)
        state= 'Network with id [%s] is deleted' % network_to_delete_id
        set_cluster_state(token, cluster_id, state)
        sleep(10)  # Take some time to ensure it is deleted
    except ClientError:
        logging.exception('Error in deleting network')
        list_of_errors.append(error_cluster_corrupt)
    # Delete the floating IP of deleted cluster
    try:
        nc.delete_floatingip(float_ip_to_delete_id)
        state= 'Floating IP [%s] is deleted' % float_ip_to_delete
        logging.log(SUMMARY, state)
        set_cluster_state(token, cluster_id, state)
    except ClientError:
        logging.exception('Error in deleting floating IP [%s]' % float_ip_to_delete)
        list_of_errors.append(error_cluster_corrupt)
    state= 'Cluster with public IP [%s] was deleted ' % float_ip_to_delete
    set_cluster_state(token, cluster_id, state, status=status)
    # status is already destroyed or failed, only clean up database
    if cluster_to_delete.cluster_status not in [const_cluster_status_active,const_cluster_status_pending]:
        current_task.update_state(state="Removing Record")
        try:
            db_cluster_delete(token,cluster_id)
            current_task.update_state(state="Cluster Record Removed")
        except Exception,e:
            msg = str(e.args[0])
            raise ClientError(msg, error_cluster_corrupt)
def cluster_add_node(token, cluster_id, cluster_to_scale, cyclades, netclient, plankton, status):
    """
    Create a VM in ~okeanos and attach it to the network of the requested
    cluster.

    The new node uses the same flavor and image as the existing slave
    nodes. On success the cluster's size is incremented and a dict
    describing the new slave (id, fqdn, private_ip, password, port, uuid,
    image_id) is returned. Raises ClientError when the flavor/image cannot
    be resolved or the server fails to become ACTIVE.
    """
    new_slave = {}
    server_home_path = expanduser('~')
    server_ssh_keys = join(server_home_path, ".ssh/id_rsa.pub")
    pub_keys_path = ''
    project_id = get_project_id(unmask_token(encrypt_key,token), cluster_to_scale.project_name)
    # name of new node should follow the naming convention of the rest nodes in the cluster
    node_name = cluster_to_scale.cluster_name + '-' + str(cluster_to_scale.cluster_size + 1)
    state = "Adding new datanode {0}".format(node_name)
    set_cluster_state(token, cluster_id, state)
    try:
        flavor_list = cyclades.list_flavors(True)
    except ClientError:
        msg = 'Could not get list of flavors'
        raise ClientError(msg, error_flavor_list)
    flavor_id = None
    for flavor in flavor_list:
        # The new node will be a slave node and will have the same flavor combination with the existing nodes of the cluster
        if flavor['ram'] == cluster_to_scale.ram_slaves and \
                flavor['SNF:disk_template'] == cluster_to_scale.disk_template and \
                flavor['vcpus'] == cluster_to_scale.cpu_slaves and \
                flavor['disk'] == cluster_to_scale.disk_slaves:
            flavor_id = flavor['id']
    if flavor_id is None:
        # Fixed: previously fell through to a NameError at create_server
        # when no flavor matched the slave specification.
        msg = 'Could not find a flavor matching the cluster slave nodes'
        raise ClientError(msg, error_flavor_list)
    chosen_image = {}
    list_current_images = plankton.list_public(True, 'default')
    # Find image id of the operating system arg given
    for lst in list_current_images:
        # new node should have the same image as the other cluster nodes
        if lst['name'] == cluster_to_scale.os_image:
            chosen_image = lst
            chosen_image_id = chosen_image['id']
    if not chosen_image:
        msg = ' Image not found.'
        raise ClientError(msg, error_image_id)
    master_id = None
    network_to_edit_id = None
    new_status = 'placeholder'
    # Get master virtual machine and network from IP
    ip = get_public_ip_id(netclient, cluster_to_scale.master_IP)
    master_id = ip['instance_id']
    master_server = cyclades.get_server_details(master_id)
    for attachment in master_server['attachments']:
        if (attachment['OS-EXT-IPS:type'] == 'fixed' and not attachment['ipv6']):
            network_to_edit_id = attachment['network_id']
            break
    # Create new node with create server methode of cyclades
    new_server = cyclades.create_server(node_name, flavor_id, chosen_image_id,
                                        personality=personality(server_ssh_keys,pub_keys_path),
                                        networks=[{"uuid": network_to_edit_id}], project_id=project_id)
    new_status = cyclades.wait_server(new_server['id'], max_wait=MAX_WAIT)
    if new_status != 'ACTIVE':
        # Fixed: error message previously referenced undefined names
        # servers[i]; use the new node's name instead.
        msg = ' Status for server [%s] is %s. Server will be deleted' % \
            (node_name, new_status)
        cyclades.delete_server(new_server['id'])
        raise ClientError(msg, error_create_server)
    cluster_to_scale.cluster_size = cluster_to_scale.cluster_size + 1 # new cluster size
    cluster_to_scale.save()
    # Internal addressing follows the cluster's 192.168.0.x convention.
    new_slave_private_ip = '192.168.0.{0}'.format(str(1 + cluster_to_scale.cluster_size))
    # Add new node to network
    new_slave_port = ADD_TO_GET_PORT + cluster_to_scale.cluster_size
    state = "New datanode {0} was added to cluster network".format(node_name)
    set_cluster_state(token, cluster_id, state, status='Active')
    new_slave = {'id':new_server['id'], 'fqdn': new_server['SNF:fqdn'],'private_ip': new_slave_private_ip,
                 'password': new_server['adminPass'],'port': new_slave_port,'uuid': new_server['user_id'],
                 'image_id':new_server['image']['id']}
    return new_slave