def rollback_scale_cluster(list_of_slaves, cyclades, cluster_to_scale, size, ansible=False):
    """
    Undo a failed scale-up of a cluster.

    Deletes the freshly created slave servers and restores the recorded
    cluster size. When *ansible* is True the failure happened during the
    hadoop configuration of the new nodes, so the ansible inventory is
    cleaned up as well and a rollback play is run against the cluster.
    """
    from run_ansible_playbooks import modify_ansible_hosts_file, ansible_scale_cluster

    cluster_label = '{0}-{1}'.format(cluster_to_scale.cluster_name,
                                     cluster_to_scale.id)
    # Destroy every server that was created for this (failed) scale action.
    for new_node in list_of_slaves:
        cyclades.delete_server(new_node['id'])
    if ansible:
        # The new nodes already made it into the ansible inventory:
        # remove each of them, then run the rollback play on what is left.
        for new_node in list_of_slaves:
            modify_ansible_hosts_file(cluster_label,
                                      action='remove_slaves',
                                      slave_hostname=new_node['fqdn'])
        hosts_file = modify_ansible_hosts_file(cluster_label,
                                               action='join_slaves')
        ansible_scale_cluster(hosts_file, action='rollback_scale_cluster')
    # Restore the pre-scale bookkeeping.
    cluster_to_scale.cluster_size = size
    cluster_to_scale.save()
def scale_cluster(token, cluster_id, cluster_delta, status='Pending'):
    """
    Scales an active cluster by cluster_delta (signed int).
    For scaling up finds the cluster settings and last internal ip/port slave
    and "appends" cluster_delta nodes. For scaling down it removes the last
    slave.
    """
    # NOTE(review): the 'status' parameter is never read in the code visible
    # here -- confirm against the full file/callers before relying on it.
    from reroute_ssh import reroute_ssh_to_slaves
    from run_ansible_playbooks import modify_ansible_hosts_file,ansible_scale_cluster,ansible_manage_cluster
    cluster_to_scale = ClusterInfo.objects.get(id=cluster_id)
    # Remember pre-scale size/status so they can be restored on rollback and
    # reported on failure.
    pre_scale_size = cluster_to_scale.cluster_size
    previous_cluster_status = cluster_to_scale.cluster_status
    previous_hadoop_status = cluster_to_scale.hadoop_status
    # cluster_status is compared/indexed as a string code below; this maps a
    # code to its display name.
    status_map = {"0":"Destroyed","1":"Active","2":"Pending","3":"Failed"}
    # pre-flight checks. If cluster status is pending or hadoop status formatting abort.
    if (previous_cluster_status == const_cluster_status_pending) or (previous_hadoop_status == const_hadoop_status_format):
        current_task.update_state(state="Skipping")
        return cluster_to_scale.cluster_name
    # pre-flight checks done
    current_task.update_state(state="Started")
    auth = check_credentials(unmask_token(encrypt_key,token))
    current_task.update_state(state="Authenticated")
    endpoints, user_id = endpoints_and_user_id(auth)
    # Clients for the compute, network and image services.
    cyclades = init_cyclades(endpoints['cyclades'], unmask_token(encrypt_key,token))
    netclient = init_cyclades_netclient(endpoints['network'], unmask_token(encrypt_key,token))
    plankton = init_plankton(endpoints['plankton'], unmask_token(encrypt_key,token))
    state = ''
    list_of_new_slaves = []
    cluster_name_suffix_id = '{0}-{1}'.format(cluster_to_scale.cluster_name, cluster_id)
    if cluster_delta < 0:
        # scale down: decommission |cluster_delta| slaves, one per iteration.
        for counter in range(cluster_delta,0):
            state = "Starting node decommission for %s" % (cluster_to_scale.cluster_name)
            set_cluster_state(token, cluster_id, state)
            try:
                node_fqdn, node_id = find_node_to_remove(cluster_to_scale, cyclades, netclient)
                state = "Decommissioning Node %s from %s" % (node_fqdn,cluster_to_scale.cluster_name)
                set_cluster_state(token, cluster_id, state)
                # Drop the node from the ansible inventory, then run the
                # remove-slaves play with the node's short hostname.
                ansible_hosts = modify_ansible_hosts_file(cluster_name_suffix_id, action='remove_slaves', slave_hostname=node_fqdn)
                ansible_scale_cluster(ansible_hosts, action='remove_slaves', slave_hostname=node_fqdn.split('.')[0])
            except Exception, e:
                # Restore the pre-scale status before surfacing the error.
                msg = str(e.args[0])
                set_cluster_state(token, cluster_id, state=msg, status=status_map[previous_cluster_status], error=msg)
                raise RuntimeError(msg)
            state = "Node %s decommissioned from %s and will be deleted"% (node_fqdn, cluster_to_scale.cluster_name)
            cluster_remove_node(node_fqdn, node_id, token, cluster_id, cluster_to_scale, cyclades, status_map[previous_cluster_status])
    # NOTE(review): a span of this function appears to be missing from the
    # visible source: nothing above defines 'new_slave' or 'master_ip', and
    # 'list_of_new_slaves' is never populated, yet all three are used below.
    # The scale-up branch (server creation) presumably lives in the missing
    # span -- confirm against the full file. The indentation of the lines
    # below is reconstructed and may not match the original nesting.
    image_id = new_slave['image_id']
    linux_dist = get_system_dist(cluster_to_scale.os_image)
    try:
        # Open an ssh route to every new slave (through master_ip) so the
        # configuration plays can reach them.
        for new_slave in list_of_new_slaves:
            reroute_ssh_to_slaves(new_slave['port'], new_slave['private_ip'], master_ip, new_slave['password'], '',linux_dist)
    except Exception, e:
        # ssh rerouting failed: delete the new servers, restore the previous
        # size/status and re-raise. ansible inventory was not touched yet.
        msg = '{0}. Scale action failed. Cluster rolled back'.format(str(e.args[0]))
        set_cluster_state(token, cluster_id, msg)
        rollback_scale_cluster(list_of_new_slaves, cyclades, cluster_to_scale, pre_scale_size)
        set_cluster_state(token, cluster_id, state=msg, status=status_map[previous_cluster_status], error=msg)
        raise RuntimeError(msg)
    try:
        # Register the new slaves in the ansible inventory and configure
        # hadoop on them.
        ansible_hosts = modify_ansible_hosts_file(cluster_name_suffix_id, list_of_hosts=list_of_new_slaves, master_ip=master_ip, action='add_slaves')
        state = 'Configuring Hadoop for new nodes of %s ' % cluster_to_scale.cluster_name
        set_cluster_state(token, cluster_id, state)
        ansible_scale_cluster(ansible_hosts, new_slaves_size=len(list_of_new_slaves), orka_image_uuid=image_id, user_id=user_id)
        modify_ansible_hosts_file(cluster_name_suffix_id, action='join_slaves')
    except Exception, e:
        # Hadoop configuration failed: roll back including the ansible
        # inventory changes (ansible=True).
        msg = '{0}. Scale action failed. Cluster rolled back'.format(str(e.args[0]))
        set_cluster_state(token, cluster_id, msg)
        rollback_scale_cluster(list_of_new_slaves, cyclades, cluster_to_scale, pre_scale_size,ansible=True)
        set_cluster_state(token, cluster_id, state=msg, status=status_map[previous_cluster_status], error=msg)
        raise RuntimeError(msg)
    finally:
        # Best-effort cleanup of the per-user temporary ansible workspace.
        subprocess.call('rm -rf /tmp/{0}'.format(user_id),shell=True)