def _check_swift_availability(self, cluster_info):
    plugin_config = cluster_info['plugin_config']
    # Generate a unique name for the Swift container used during testing
    swift_container_name = 'Swift-test-' + str(uuid.uuid4())[:8]
    extra_script_parameters = {
        'OS_TENANT_NAME': self.common_config.OS_TENANT_NAME,
        'OS_USERNAME': self.common_config.OS_USERNAME,
        'OS_PASSWORD': self.common_config.OS_PASSWORD,
        'HADOOP_USER': plugin_config.HADOOP_USER,
        'SWIFT_CONTAINER_NAME': swift_container_name
    }
    namenode_ip = cluster_info['node_info']['namenode_ip']
    self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
    try:
        self.transfer_helper_script_to_node(
            'swift_test_script.sh', parameter_list=extra_script_parameters
        )
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))
    swift = self.connect_to_swift()
    swift.put_container(swift_container_name)
    try:
        self.execute_command('./script.sh')
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))
    finally:
        self.delete_swift_container(swift, swift_container_name)
    self.close_ssh_connection()
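# A minimal, self-contained sketch of the semantics of
# excutils.save_and_reraise_exception() as used in the snippets throughout
# this section. This is an assumption-level illustration of oslo-style
# behavior, not the library's actual implementation (the real version also
# logs the original exception and supports a reraise flag).
import sys


class save_and_reraise_exception(object):
    def __enter__(self):
        # Remember the exception currently being handled.
        self.exc_value = sys.exc_info()[1]
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is not None:
            # The cleanup body itself failed; let that exception propagate.
            return False
        # Cleanup succeeded: re-raise the originally caught exception.
        raise self.exc_value


# Usage mirroring the snippets in this section:
try:
    raise RuntimeError('boom')
except Exception as e:
    try:
        with save_and_reraise_exception():
            print('cleaning up after: %s' % e)  # cleanup runs first
    except RuntimeError:
        print('original exception was re-raised')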
def consume_in_thread(self):
    """Runs the ZmqProxy service."""
    ipc_dir = CONF.rpc_zmq_ipc_dir
    consume_in = "tcp://%s:%s" % \
        (CONF.rpc_zmq_bind_address, CONF.rpc_zmq_port)
    consumption_proxy = InternalContext(None)

    try:
        os.makedirs(ipc_dir)
    except os.error:
        if not os.path.isdir(ipc_dir):
            with excutils.save_and_reraise_exception():
                LOG.error(_("Required IPC directory does not exist at"
                            " %s") % (ipc_dir, ))
    try:
        self.register(consumption_proxy,
                      consume_in,
                      zmq.PULL)
    except zmq.ZMQError:
        if os.access(ipc_dir, os.X_OK):
            with excutils.save_and_reraise_exception():
                LOG.error(_("Permission denied to IPC directory at"
                            " %s") % (ipc_dir, ))
        with excutils.save_and_reraise_exception():
            LOG.error(_("Could not create ZeroMQ receiver daemon. "
                        "Socket may already be in use."))

    super(ZmqProxy, self).consume_in_thread()
def check_swift_availability(self, cluster_info):
    plugin_config = cluster_info['plugin_config']
    # Generate a unique name for the Swift container used during testing
    swift_container_name = 'Swift-test-' + str(uuid.uuid4())[:8]
    extra_script_parameters = {
        'OS_TENANT_NAME': self.common_config.OS_TENANT_NAME,
        'OS_USERNAME': self.common_config.OS_USERNAME,
        'OS_PASSWORD': self.common_config.OS_PASSWORD,
        'HADOOP_USER': plugin_config.HADOOP_USER,
        'SWIFT_CONTAINER_NAME': swift_container_name
    }
    namenode_ip = cluster_info['node_info']['namenode_ip']
    self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
    try:
        self.transfer_helper_script_to_node(
            'swift_test_script.sh', parameter_list=extra_script_parameters)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))
    swift = self.connect_to_swift()
    swift.put_container(swift_container_name)
    try:
        self.execute_command('./script.sh')
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))
    finally:
        self.delete_swift_container(swift, swift_container_name)
    self.close_ssh_connection()
def create_cluster_and_get_info(self, plugin_config, cluster_template_id,
                                description, cluster_configs,
                                node_groups=None, anti_affinity=None,
                                net_id=None, is_transient=False):
    self.cluster_id = None
    data = self.savanna.clusters.create(
        self.common_config.CLUSTER_NAME + '-' + plugin_config.PLUGIN_NAME,
        plugin_config.PLUGIN_NAME, plugin_config.HADOOP_VERSION,
        cluster_template_id, plugin_config.IMAGE_ID, is_transient,
        description, cluster_configs, node_groups,
        self.common_config.USER_KEYPAIR_ID, anti_affinity, net_id)
    self.cluster_id = data.id
    self.poll_cluster_state(self.cluster_id)
    node_ip_list_with_node_processes = (
        self.get_cluster_node_ip_list_with_node_processes(self.cluster_id))
    try:
        node_info = self.get_node_info(node_ip_list_with_node_processes,
                                       plugin_config)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(
                '\nFailure during check of node process deployment '
                'on cluster node: ' + str(e)
            )
    try:
        self.await_active_workers_for_namenode(node_info, plugin_config)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(
                '\nFailure while awaiting active workers for namenode: '
                + str(e)
            )
    # For example, the method "create_cluster_and_get_info" returns:
    # {
    #     'node_info': {
    #         'tasktracker_count': 3,
    #         'node_count': 6,
    #         'namenode_ip': '172.18.168.242',
    #         'datanode_count': 3
    #     },
    #     'cluster_id': 'bee5c6a1-411a-4e88-95fc-d1fbdff2bb9d',
    #     'node_ip_list': {
    #         '172.18.168.153': ['tasktracker', 'datanode'],
    #         '172.18.168.208': ['secondarynamenode', 'oozie'],
    #         '172.18.168.93': ['tasktracker'],
    #         '172.18.168.101': ['tasktracker', 'datanode'],
    #         '172.18.168.242': ['namenode', 'jobtracker'],
    #         '172.18.168.167': ['datanode']
    #     },
    #     'plugin_config': <oslo.config.cfg.GroupAttr object at 0x215d9d>
    # }
    return {
        'cluster_id': self.cluster_id,
        'node_ip_list': node_ip_list_with_node_processes,
        'node_info': node_info,
        'plugin_config': plugin_config
    }
def _map_reduce_testing(self, cluster_info):
    plugin_config = cluster_info['plugin_config']
    node_count = cluster_info['node_info']['node_count']
    extra_script_parameters = {
        'HADOOP_VERSION': plugin_config.HADOOP_VERSION,
        'HADOOP_DIRECTORY': plugin_config.HADOOP_DIRECTORY,
        'HADOOP_LOG_DIRECTORY': plugin_config.HADOOP_LOG_DIRECTORY,
        'HADOOP_USER': plugin_config.HADOOP_USER,
        'NODE_COUNT': node_count,
        'PLUGIN_NAME': plugin_config.PLUGIN_NAME
    }
    node_ip_and_process_list = cluster_info['node_ip_list']
    try:
        self.transfer_helper_script_to_nodes(
            node_ip_and_process_list, plugin_config.SSH_USERNAME,
            'map_reduce_test_script.sh',
            parameter_list=extra_script_parameters
        )
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))
    namenode_ip = cluster_info['node_info']['namenode_ip']
    self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
    self.__run_pi_job()
    job_name = self.__get_name_of_completed_pi_job()
    self.close_ssh_connection()
    # Check that the cluster used every "tasktracker" node while running
    # the PI job. The map and reduce task counts in the helper script
    # guarantee that the cluster uses each of those nodes during the job.
    try:
        for node_ip, process_list in node_ip_and_process_list.items():
            if plugin_config.PROCESS_NAMES['tt'] in process_list:
                self.open_ssh_connection(
                    node_ip, plugin_config.SSH_USERNAME
                )
                self.execute_command(
                    './script.sh check_directory -job_name %s' % job_name
                )
                self.close_ssh_connection()
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(
                '\nLog file of completed \'PI\' job on \'tasktracker\' '
                'cluster node not found: ' + str(e)
            )
            self.close_ssh_connection()
            self.open_ssh_connection(
                namenode_ip, plugin_config.SSH_USERNAME
            )
            self.capture_error_log_from_cluster_node(
                '/tmp/MapReduceTestOutput/log.txt'
            )
    self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
    self.__run_wordcount_job()
    self.close_ssh_connection()
def cluster_scaling(self, cluster_info, change_list):
    scale_body = {'add_node_groups': [], 'resize_node_groups': []}
    for change in change_list:
        if change['operation'] == 'resize':
            node_group_name = change['info'][0]
            node_group_size = change['info'][1]
            self._add_new_field_to_scale_body_while_ng_resizing(
                scale_body, node_group_name, node_group_size)
            self._change_node_info_while_ng_resizing(
                node_group_name, node_group_size, cluster_info)
        if change['operation'] == 'add':
            node_group_name = change['info'][0]
            node_group_size = change['info'][1]
            node_group_id = change['info'][2]
            self._add_new_field_to_scale_body_while_ng_adding(
                scale_body, node_group_id, node_group_size, node_group_name)
            self._change_node_info_while_ng_adding(
                node_group_id, node_group_size, cluster_info)
    self.savanna.clusters.scale(cluster_info['cluster_id'], scale_body)
    self.poll_cluster_state(cluster_info['cluster_id'])
    new_node_ip_list = self.get_cluster_node_ip_list_with_node_processes(
        cluster_info['cluster_id'])
    try:
        new_node_info = self.get_node_info(new_node_ip_list,
                                           cluster_info['plugin_config'])
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print('\nFailure during check of node process deployment '
                  'on cluster node: ' + str(e))
    expected_node_info = cluster_info['node_info']
    self.assertEqual(
        expected_node_info, new_node_info,
        'Failure during node info comparison.\n'
        'Expected node info after cluster scaling: %s.\n'
        'Actual node info after cluster scaling: %s.'
        % (expected_node_info, new_node_info))
    try:
        self.await_active_workers_for_namenode(
            new_node_info, cluster_info['plugin_config'])
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print('\nFailure while awaiting active workers for namenode: '
                  + str(e))
    return {
        'cluster_id': cluster_info['cluster_id'],
        'node_ip_list': new_node_ip_list,
        'node_info': new_node_info,
        'plugin_config': cluster_info['plugin_config']
    }
def _check_swift_availability(self, cluster_info):
    plugin_config = cluster_info["plugin_config"]
    # Generate a unique name for the Swift container used during testing
    swift_container_name = "Swift-test-" + str(uuid.uuid4())
    extra_script_parameters = {
        "OS_TENANT_NAME": self.common_config.OS_TENANT_NAME,
        "OS_USERNAME": self.common_config.OS_USERNAME,
        "OS_PASSWORD": self.common_config.OS_PASSWORD,
        "HADOOP_USER": plugin_config.HADOOP_USER,
        "SWIFT_CONTAINER_NAME": swift_container_name,
    }
    namenode_ip = cluster_info["node_info"]["namenode_ip"]
    self.open_ssh_connection(namenode_ip, plugin_config.NODE_USERNAME)
    try:
        self.transfer_helper_script_to_node(
            "swift_test_script.sh", parameter_list=extra_script_parameters)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))
    swift = self.connect_to_swift()
    swift.put_container(swift_container_name)
    try:
        self.execute_command("./script.sh")
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))
    finally:
        self.delete_swift_container(swift, swift_container_name)
    self.close_ssh_connection()
def create_cluster(values):
    ctx = context.ctx()
    cluster = conductor.cluster_create(ctx, values)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)

    # validating cluster
    try:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Validating"})
        LOG.info(g.format_cluster_status(cluster))
        plugin.validate(cluster)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            cluster = conductor.cluster_update(
                ctx, cluster, {"status": "Error",
                               "status_description": str(e)})
            LOG.info(g.format_cluster_status(cluster))

    context.spawn("cluster-creating-%s" % cluster.id,
                  _provision_cluster, cluster.id)
    if CONF.use_identity_api_v3 and cluster.is_transient:
        trusts.create_trust(cluster)

    return conductor.cluster_get(ctx, cluster.id)
def _add_params_to_script_and_transfer_to_node(self, cluster_info,
                                               node_group,
                                               node_with_volumes=False):
    plugin_config = cluster_info['plugin_config']
    hadoop_log_directory = plugin_config.HADOOP_LOG_DIRECTORY
    if node_with_volumes:
        hadoop_log_directory = (
            plugin_config.HADOOP_LOG_DIRECTORY_ON_VOLUME)
    extra_script_parameters = {
        'HADOOP_VERSION': plugin_config.HADOOP_VERSION,
        'HADOOP_DIRECTORY': plugin_config.HADOOP_DIRECTORY,
        'HADOOP_LOG_DIRECTORY': hadoop_log_directory,
        'HADOOP_USER': plugin_config.HADOOP_USER,
        'NODE_COUNT': cluster_info['node_info']['node_count'],
        'PLUGIN_NAME': plugin_config.PLUGIN_NAME
    }
    for instance in node_group['instances']:
        try:
            self.open_ssh_connection(instance['management_ip'],
                                     plugin_config.SSH_USERNAME)
            self.transfer_helper_script_to_node(
                'map_reduce_test_script.sh', extra_script_parameters)
            self.close_ssh_connection()
        except Exception as e:
            with excutils.save_and_reraise_exception():
                print(str(e))
def transfer_helper_script_to_node(self, script_name, parameter_list=None):
    script = open('savanna/tests/integration/tests/resources/%s'
                  % script_name).read()
    if parameter_list:
        for parameter, value in parameter_list.items():
            script = script.replace(
                '%s=""' % parameter, '%s=%s' % (parameter, value))
    try:
        self.write_file_to('script.sh', script)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(
                '\nFailure while transferring helper script '
                'to cluster node: ' + str(e)
            )
    self.execute_command('chmod 777 script.sh')
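# A minimal sketch of the parameter substitution performed by
# transfer_helper_script_to_node above: occurrences of NAME="" in the helper
# script are rewritten to NAME=value. The script text and parameter dict
# below are hypothetical, not taken from the snippets.
script = 'OS_USERNAME=""\nSWIFT_CONTAINER_NAME=""\n'
for parameter, value in {'OS_USERNAME': 'admin'}.items():
    script = script.replace('%s=""' % parameter, '%s=%s' % (parameter, value))
assert script == 'OS_USERNAME=admin\nSWIFT_CONTAINER_NAME=""\n'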
def map_reduce_testing(self, cluster_info):
    self._transfer_helper_script_to_nodes(cluster_info)
    plugin_config = cluster_info['plugin_config']
    namenode_ip = cluster_info['node_info']['namenode_ip']
    self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
    self._run_pi_job()
    job_name = self._get_name_of_completed_pi_job()
    self.close_ssh_connection()
    # Check that the cluster used every "tasktracker" node while running
    # the PI job. The map and reduce task counts in the helper script
    # guarantee that the cluster uses each of those nodes during the job.
    node_ip_and_process_list = cluster_info['node_ip_list']
    try:
        for node_ip, process_list in node_ip_and_process_list.items():
            if plugin_config.PROCESS_NAMES['tt'] in process_list:
                self.open_ssh_connection(node_ip,
                                         plugin_config.SSH_USERNAME)
                self.execute_command(
                    './script.sh check_directory -job_name %s' % job_name)
                self.close_ssh_connection()
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print('\nLog file of completed \'PI\' job on \'tasktracker\' '
                  'cluster node not found: ' + str(e))
            self.close_ssh_connection()
            self.open_ssh_connection(namenode_ip,
                                     plugin_config.SSH_USERNAME)
            self.capture_error_log_from_cluster_node(
                '/tmp/MapReduceTestOutput/log.txt')
    self.open_ssh_connection(namenode_ip, plugin_config.SSH_USERNAME)
    self._run_wordcount_job()
    self.close_ssh_connection()
def scale_cluster(cluster_id, data):
    cluster = get_cluster(id=cluster_id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    existing_node_groups = data.get("resize_node_groups", [])
    additional_node_groups = data.get("add_node_groups", [])

    # the next map is the main object we will work with
    # to_be_enlarged : {node_group_name: desired_amount_of_instances}
    to_be_enlarged = {}
    for ng in existing_node_groups:
        to_be_enlarged.update({ng["name"]: ng["count"]})

    additional = construct_ngs_for_scaling(additional_node_groups)

    try:
        context.model_update(cluster, status="Validating")
        plugin.validate_scaling(cluster, to_be_enlarged, additional)
    except Exception:
        with excutils.save_and_reraise_exception():
            context.model_update(cluster, status="Active")

    # If we are here, validation was successful.
    # So let's update the db and the to_be_enlarged map:
    for add_n_g in additional:
        cluster.node_groups.append(add_n_g)
        to_be_enlarged.update({add_n_g.name: additional[add_n_g]})
    context.model_save(cluster)

    context.spawn(_provision_nodes, cluster_id, to_be_enlarged)
    return cluster
def create_cluster(cluster):
    ctx = context.ctx()
    try:
        # create all instances
        conductor.cluster_update(ctx, cluster, {"status": "Spawning"})
        LOG.info(g.format_cluster_status(cluster))
        _create_instances(cluster)

        # wait until all instances are up and accessible
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Waiting"})
        LOG.info(g.format_cluster_status(cluster))
        cluster = _await_instances(cluster)

        # attach volumes
        volumes.attach(cluster)

        # prepare all instances
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Preparing"})
        LOG.info(g.format_cluster_status(cluster))

        _configure_instances(cluster)
    except Exception as ex:
        LOG.warn("Can't start cluster '%s' (reason: %s)", cluster.name, ex)
        with excutils.save_and_reraise_exception():
            cluster = conductor.cluster_update(
                ctx, cluster, {"status": "Error",
                               "status_description": str(ex)})
            LOG.info(g.format_cluster_status(cluster))
            _rollback_cluster_creation(cluster, ex)
def _scale_cluster(cluster, target_count):
    ctx = context.ctx()
    rollback_count = _get_ng_counts(cluster)

    launcher = _ScaleLauncher()

    try:
        launcher.launch_instances(ctx, cluster, target_count)
    except Exception as ex:
        LOG.warn("Can't scale cluster '%s' (reason: %s)", cluster.name, ex)
        with excutils.save_and_reraise_exception():
            cluster = conductor.cluster_get(ctx, cluster)

            try:
                _rollback_cluster_scaling(ctx, cluster, rollback_count,
                                          target_count)
            except Exception:
                # if something fails during the rollback, we stop
                # doing anything further
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Error"})
                LOG.info(g.format_cluster_status(cluster))
                LOG.error("Unable to complete rollback, aborting")
                raise

            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))
            LOG.warn("Rollback successful. Re-raising initial exception.")
    finally:
        cluster = conductor.cluster_get(ctx, cluster)
        _clean_cluster_from_empty_ng(cluster)

    return launcher.inst_ids
def scale_cluster(cluster, node_group_names_map, plugin):
    # Now let's work with real node_groups, not names:
    node_groups_map = {}
    for ng in cluster.node_groups:
        if ng.name in node_group_names_map:
            node_groups_map.update({ng: node_group_names_map[ng.name]})
    instances_list = []
    try:
        instances_list = _scale_cluster_instances(
            cluster, node_groups_map, plugin)
        _clean_cluster_from_empty_ng(cluster)
        _await_instances(cluster)
        volumes.attach_to_instances(instances_list)
    except Exception as ex:
        LOG.warn("Can't scale cluster '%s' (reason: %s)", cluster.name, ex)
        with excutils.save_and_reraise_exception():
            _rollback_cluster_scaling(cluster, instances_list, ex)
            instances_list = []
            _clean_cluster_from_empty_ng(cluster)
            if cluster.status == 'Decommissioning':
                context.model_update(cluster, status='Error')
            else:
                context.model_update(cluster, status='Active')

    # We should be here with a valid cluster: if instance creation
    # failed, all extra instances will have been removed above.
    if instances_list:
        _configure_instances(cluster)
    return instances_list
def scale_cluster(id, data):
    ctx = context.ctx()
    cluster = conductor.cluster_get(ctx, id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    existing_node_groups = data.get("resize_node_groups", [])
    additional_node_groups = data.get("add_node_groups", [])

    # the next map is the main object we will work with
    # to_be_enlarged : {node_group_id: desired_amount_of_instances}
    to_be_enlarged = {}
    for ng in existing_node_groups:
        ng_id = g.find(cluster.node_groups, name=ng["name"])["id"]
        to_be_enlarged.update({ng_id: ng["count"]})

    additional = construct_ngs_for_scaling(cluster, additional_node_groups)

    try:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Validating"})
        LOG.info(g.format_cluster_status(cluster))
        plugin.validate_scaling(cluster, to_be_enlarged, additional)
    except Exception:
        with excutils.save_and_reraise_exception():
            i.clean_cluster_from_empty_ng(cluster)
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))

    # If we are here, validation was successful.
    # So let's update the to_be_enlarged map:
    to_be_enlarged.update(additional)

    context.spawn("cluster-scaling-%s" % id,
                  _provision_nodes, id, to_be_enlarged)
    return conductor.cluster_get(ctx, id)
def _cluster_config_testing(self, cluster_info):
    cluster_id = cluster_info['cluster_id']
    data = self.savanna.clusters.get(cluster_id)
    self.__compare_configs(
        {'Enable Swift': True}, data.cluster_configs['general']
    )
    self.__compare_configs(
        CLUSTER_HDFS_CONFIG, data.cluster_configs['HDFS']
    )
    self.__compare_configs(
        CLUSTER_MR_CONFIG, data.cluster_configs['MapReduce']
    )
    node_groups = data.node_groups
    self.__check_configs_for_node_groups(node_groups)
    node_ip_list_with_node_processes = (
        self.get_cluster_node_ip_list_with_node_processes(cluster_id))
    try:
        self.transfer_helper_script_to_nodes(
            node_ip_list_with_node_processes,
            self.vanilla_config.SSH_USERNAME,
            'cluster_config_test_script.sh'
        )
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))
    self.__check_config_application_on_cluster_nodes(
        node_ip_list_with_node_processes
    )
def __enter__(self):
    _acquire_remote_semaphore()
    try:
        self.bulk = BulkInstanceInteropHelper(self.instance)
        return self.bulk
    except Exception:
        with excutils.save_and_reraise_exception():
            _release_remote_semaphore()
def __init__(self, instance):
    super(BulkInstanceInteropHelper, self).__init__(instance)
    self.proc = procutils.start_subprocess()
    try:
        procutils.run_in_subprocess(self.proc, _connect,
                                    self._get_conn_params())
    except Exception:
        with excutils.save_and_reraise_exception():
            procutils.shutdown_subprocess(self.proc, _cleanup)
def _run_wordcount_job(self):
    try:
        self.execute_command('./script.sh run_wordcount_job')
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print('\nFailure while launching \'Wordcount\' job: ' + str(e))
            self.capture_error_log_from_cluster_node(
                '/tmp/MapReduceTestOutput/log.txt')
def __init__(self, instance, username):
    self.instance = instance
    self.username = username
    self.proc = procutils.start_subprocess()
    try:
        procutils.run_in_subprocess(self.proc, _connect,
                                    self._get_conn_params())
    except Exception:
        with excutils.save_and_reraise_exception():
            procutils.shutdown_subprocess(self.proc, _cleanup)
def __run_wordcount_job(self):
    try:
        self.execute_command('./script.sh run_wordcount_job')
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print('\nFailure while launching \'Wordcount\' job: ' + str(e))
            self.capture_error_log_from_cluster_node(
                '/tmp/MapReduceTestOutput/log.txt'
            )
def try_telnet(self, host, port):
    try:
        telnetlib.Telnet(host, port)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(
                '\nTelnet has failed: ' + str(e) +
                ' NODE IP: %s, PORT: %s. Passed %s minute(s).'
                % (host, port, self.common_config.TELNET_TIMEOUT)
            )
@contextlib.contextmanager
def remove_path_on_error(path):
    """Protect code that wants to operate on PATH atomically.
    Any exception will cause PATH to be removed.

    :param path: File to work with
    """
    try:
        yield
    except Exception:
        with excutils.save_and_reraise_exception():
            delete_if_exists(path)
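# A usage sketch for remove_path_on_error above (assumption: the generator is
# wrapped with @contextlib.contextmanager, as in oslo's fileutils, and
# delete_if_exists is available in scope; the temp-file workflow below is
# illustrative only).
import os
import tempfile

fd, path = tempfile.mkstemp()
os.close(fd)
try:
    with remove_path_on_error(path):
        raise IOError('simulated write failure')
except IOError:
    assert not os.path.exists(path)  # the partial file was cleaned up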
def _get_name_of_completed_pi_job(self):
    try:
        job_name = self.execute_command('./script.sh get_pi_job_name')
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print('\nFailure while obtaining name of completed \'PI\' '
                  'job: ' + str(e))
            self.capture_error_log_from_cluster_node(
                '/tmp/MapReduceTestOutput/log.txt')

    return job_name[1][:-1]
def try_get_image_id_and_savanna_username(parameter, value):
    try:
        return image.id, image.metadata['_savanna_username']
    except KeyError:
        with excutils.save_and_reraise_exception():
            print_error_log(parameter, value)
def try_get_image_id_and_ssh_username(parameter, value):
    try:
        if not plugin_config.SSH_USERNAME:
            return image.id, image.metadata['_savanna_username']
        else:
            return image.id, plugin_config.SSH_USERNAME
    except KeyError:
        with excutils.save_and_reraise_exception():
            print_error_log(parameter, value)
def _serialize(data):
    """Serialization wrapper.

    We prefer using JSON, but it cannot encode all types.
    Error if a developer passes us bad data.
    """
    try:
        return jsonutils.dumps(data, ensure_ascii=True)
    except TypeError:
        with excutils.save_and_reraise_exception():
            LOG.error(_("JSON serialization failed."))
def _compare_configs_on_cluster_node(self, config, value):
    config = config.replace(' ', '')
    try:
        self.execute_command('./script.sh %s -value %s' % (config, value))
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print('\nFailure while config comparison on cluster node: '
                  + str(e))
            self.capture_error_log_from_cluster_node(
                '/tmp/config-test-log.txt')
def _run(self, func, *args, **kwargs):
    proc = procutils.start_subprocess()

    try:
        procutils.run_in_subprocess(proc, _connect,
                                    self._get_conn_params())
        return procutils.run_in_subprocess(proc, func, args, kwargs)
    except Exception:
        with excutils.save_and_reraise_exception():
            procutils.shutdown_subprocess(proc, _cleanup)
    finally:
        procutils.shutdown_subprocess(proc, _cleanup)
def scale_cluster(self, cluster, node_group_id_map):
    ctx = context.ctx()

    instance_ids = []
    try:
        instance_ids = self._scale_cluster_instances(cluster,
                                                     node_group_id_map)

        cluster = conductor.cluster_get(ctx, cluster)
        g.clean_cluster_from_empty_ng(cluster)

        cluster = conductor.cluster_get(ctx, cluster)
        instances = g.get_instances(cluster, instance_ids)

        self._await_active(cluster, instances)

        self._assign_floating_ips(instances)

        self._await_networks(cluster, instances)

        cluster = conductor.cluster_get(ctx, cluster)

        volumes.attach_to_instances(
            g.get_instances(cluster, instance_ids))

    except Exception as ex:
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't scale cluster '%s' (reason: %s)", cluster, ex)

            cluster = conductor.cluster_get(ctx, cluster)
            self._rollback_cluster_scaling(
                cluster, g.get_instances(cluster, instance_ids), ex)
            instance_ids = []

            cluster = conductor.cluster_get(ctx, cluster)
            g.clean_cluster_from_empty_ng(cluster)
            if cluster.status == 'Decommissioning':
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Error"})
            else:
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Active"})

            LOG.info(g.format_cluster_status(cluster))

    # We should be here with a valid cluster: if instance creation
    # failed, all extra instances will have been removed above.
    if instance_ids:
        self._configure_instances(cluster)
    return instance_ids
def __compare_configs_on_cluster_node(self, config, value):
    config = config.replace(' ', '')
    try:
        self.execute_command('./script.sh %s -value %s' % (config, value))
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(
                '\nFailure while config comparison on cluster node: '
                + str(e)
            )
            self.capture_error_log_from_cluster_node(
                '/tmp/config-test-log.txt'
            )
def __get_name_of_completed_pi_job(self):
    try:
        job_name = self.execute_command('./script.sh get_pi_job_name')
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(
                '\nFailure while obtaining name of completed \'PI\' job: '
                + str(e)
            )
            self.capture_error_log_from_cluster_node(
                '/tmp/MapReduceTestOutput/log.txt'
            )

    return job_name[1][:-1]
def scale_cluster(id, data):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    existing_node_groups = data.get('resize_node_groups', [])
    additional_node_groups = data.get('add_node_groups', [])

    # the next map is the main object we will work with
    # to_be_enlarged : {node_group_id: desired_amount_of_instances}
    to_be_enlarged = {}
    for ng in existing_node_groups:
        ng_id = g.find(cluster.node_groups, name=ng['name'])['id']
        to_be_enlarged.update({ng_id: ng['count']})

    additional = construct_ngs_for_scaling(cluster, additional_node_groups)
    cluster = conductor.cluster_get(ctx, cluster)

    # update node group image usernames
    for nodegroup in cluster.node_groups:
        if additional.get(nodegroup.id):
            image_username = INFRA.get_node_group_image_username(nodegroup)
            conductor.node_group_update(ctx, nodegroup,
                                        {"image_username": image_username})
    cluster = conductor.cluster_get(ctx, cluster)

    try:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Validating"})
        LOG.info(g.format_cluster_status(cluster))
        plugin.validate_scaling(cluster, to_be_enlarged, additional)
    except Exception:
        with excutils.save_and_reraise_exception():
            g.clean_cluster_from_empty_ng(cluster)
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))

    # If we are here, validation was successful.
    # So let's update the to_be_enlarged map:
    to_be_enlarged.update(additional)

    for node_group in cluster.node_groups:
        if node_group.id not in to_be_enlarged:
            to_be_enlarged[node_group.id] = node_group.count

    context.spawn("cluster-scaling-%s" % id,
                  _provision_scaled_cluster, id, to_be_enlarged)
    return conductor.cluster_get(ctx, id)
def scale_cluster(id, data):
    ctx = context.ctx()

    cluster = conductor.cluster_get(ctx, id)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)
    existing_node_groups = data.get('resize_node_groups', [])
    additional_node_groups = data.get('add_node_groups', [])

    # the next map is the main object we will work with
    # to_be_enlarged : {node_group_id: desired_amount_of_instances}
    to_be_enlarged = {}
    for ng in existing_node_groups:
        ng_id = g.find(cluster.node_groups, name=ng['name'])['id']
        to_be_enlarged.update({ng_id: ng['count']})

    additional = construct_ngs_for_scaling(cluster, additional_node_groups)
    cluster = conductor.cluster_get(ctx, cluster)

    # update node group image usernames
    for nodegroup in cluster.node_groups:
        if additional.get(nodegroup.id):
            image_username = INFRA.get_node_group_image_username(nodegroup)
            conductor.node_group_update(
                ctx, nodegroup, {"image_username": image_username})
    cluster = conductor.cluster_get(ctx, cluster)

    try:
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Validating"})
        LOG.info(g.format_cluster_status(cluster))
        plugin.validate_scaling(cluster, to_be_enlarged, additional)
    except Exception:
        with excutils.save_and_reraise_exception():
            INFRA.clean_cluster_from_empty_ng(cluster)
            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))

    # If we are here, validation was successful.
    # So let's update the to_be_enlarged map:
    to_be_enlarged.update(additional)

    for node_group in cluster.node_groups:
        if node_group.id not in to_be_enlarged:
            to_be_enlarged[node_group.id] = node_group.count

    context.spawn("cluster-scaling-%s" % id,
                  _provision_scaled_cluster, id, to_be_enlarged)
    return conductor.cluster_get(ctx, id)
def create_cluster(values):
    cluster = s.create_cluster(values)
    plugin = plugin_base.PLUGINS.get_plugin(cluster.plugin_name)

    # validating cluster
    try:
        context.model_update(cluster, status="Validating")
        plugin.validate(cluster)
    except Exception as ex:
        with excutils.save_and_reraise_exception():
            context.model_update(cluster, status="Error",
                                 status_description=str(ex))

    context.spawn(_provision_cluster, cluster.id)

    return cluster
def get_floating_ip_pool_id_for_neutron_net(self):
    # Find the corresponding floating IP pool by its name and get its ID.
    # If the pool is not found, raise an error.
    try:
        floating_ip_pool = self.neutron.list_networks(
            name=self.common_config.FLOATING_IP_POOL)
        floating_ip_pool_id = floating_ip_pool['networks'][0]['id']
        return floating_ip_pool_id
    except IndexError:
        with excutils.save_and_reraise_exception():
            raise Exception(
                '\nFloating IP pool \'%s\' not found in pool list. '
                'Please make sure you specified the right floating IP pool.'
                % self.common_config.FLOATING_IP_POOL
            )
def get_internal_neutron_net_id(self):
    # Find the corresponding internal Neutron network by its name and get
    # its ID. If the network is not found, raise an error.
    try:
        internal_neutron_net = self.neutron.list_networks(
            name=self.common_config.INTERNAL_NEUTRON_NETWORK)
        internal_neutron_net_id = internal_neutron_net['networks'][0]['id']
        return internal_neutron_net_id
    except IndexError:
        with excutils.save_and_reraise_exception():
            raise Exception(
                '\nInternal Neutron network \'%s\' not found in network '
                'list. Please make sure you specified the right network '
                'name.' % self.common_config.INTERNAL_NEUTRON_NETWORK
            )
def create_cluster(self, cluster):
    ctx = context.ctx()
    try:
        # create all instances
        conductor.cluster_update(ctx, cluster, {"status": "Spawning"})
        LOG.info(g.format_cluster_status(cluster))
        self._create_instances(cluster)

        # wait until all instances are up and networks are ready
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Waiting"})
        LOG.info(g.format_cluster_status(cluster))

        instances = g.get_instances(cluster)

        self._await_active(cluster, instances)

        self._assign_floating_ips(instances)

        self._await_networks(cluster, instances)

        cluster = conductor.cluster_get(ctx, cluster)

        # attach volumes
        volumes.attach(cluster)

        # prepare all instances
        cluster = conductor.cluster_update(ctx, cluster,
                                           {"status": "Preparing"})
        LOG.info(g.format_cluster_status(cluster))

        self._configure_instances(cluster)
    except Exception as ex:
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't start cluster '%s' (reason: %s)", cluster, ex)

            cluster = conductor.cluster_update(
                ctx, cluster, {"status": "Error",
                               "status_description": str(ex)})
            LOG.info(g.format_cluster_status(cluster))
            self._rollback_cluster_creation(cluster, ex)
def __iter__(self):
    """Return a result until we get a reply with an 'ending' flag."""
    if self._done:
        raise StopIteration
    while True:
        try:
            data = self._dataqueue.get(timeout=self._timeout)
            result = self._process_data(data)
        except queue.Empty:
            self.done()
            raise rpc_common.Timeout()
        except Exception:
            with excutils.save_and_reraise_exception():
                self.done()
        if self._got_ending:
            self.done()
            raise StopIteration
        if isinstance(result, Exception):
            self.done()
            raise result
        yield result
def scale_cluster(cluster, node_group_id_map, plugin):
    ctx = context.ctx()
    instances_list = []
    try:
        instances_list = _scale_cluster_instances(cluster,
                                                  node_group_id_map,
                                                  plugin)
        cluster = conductor.cluster_get(ctx, cluster)
        cluster = clean_cluster_from_empty_ng(cluster)
        cluster = _await_instances(cluster)

        volumes.attach_to_instances(get_instances(cluster, instances_list))

    except Exception as ex:
        LOG.warn("Can't scale cluster '%s' (reason: %s)", cluster.name, ex)
        with excutils.save_and_reraise_exception():
            cluster = conductor.cluster_get(ctx, cluster)
            _rollback_cluster_scaling(
                cluster, get_instances(cluster, instances_list), ex)
            instances_list = []
            cluster = conductor.cluster_get(ctx, cluster)
            clean_cluster_from_empty_ng(cluster)
            if cluster.status == 'Decommissioning':
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Error"})
            else:
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))

    # We should be here with a valid cluster: if instance creation
    # failed, all extra instances will have been removed above.
    if instances_list:
        _configure_instances(cluster)
    return instances_list
def scale_cluster(self, cluster, target_count):
    ctx = context.ctx()

    rollback_count = self._get_ng_counts(cluster)

    launcher = _ScaleLauncher()

    try:
        launcher.launch_instances(ctx, cluster, target_count)
    except Exception as ex:
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't scale cluster '%s' (reason: %s)", cluster, ex)

            cluster = conductor.cluster_get(ctx, cluster)

            try:
                self._rollback_cluster_scaling(ctx, cluster,
                                               rollback_count,
                                               target_count)
            except Exception:
                # if something fails during the rollback, we stop
                # doing anything further
                cluster = conductor.cluster_update(ctx, cluster,
                                                   {"status": "Error"})
                LOG.info(g.format_cluster_status(cluster))
                LOG.error("Unable to complete rollback, aborting")
                raise

            cluster = conductor.cluster_update(ctx, cluster,
                                               {"status": "Active"})
            LOG.info(g.format_cluster_status(cluster))
            LOG.warn(
                "Rollback successful. Re-raising initial exception.")
    finally:
        cluster = conductor.cluster_get(ctx, cluster)
        g.clean_cluster_from_empty_ng(cluster)

    return launcher.inst_ids
def _cluster_config_testing(self, cluster_info):
    cluster_id = cluster_info['cluster_id']
    data = self.savanna.clusters.get(cluster_id)
    self.__compare_configs(
        data.cluster_configs['general'], {'Enable Swift': True}
    )
    self.__compare_configs(
        data.cluster_configs['HDFS'], CLUSTER_HDFS_CONFIG
    )
    self.__compare_configs(
        data.cluster_configs['MapReduce'], CLUSTER_MR_CONFIG
    )
    node_groups = data.node_groups
    self.__check_configs_for_node_groups(node_groups)
    node_ip_list_with_node_processes = \
        self.get_cluster_node_ip_list_with_node_processes(cluster_id)
    try:
        self.transfer_helper_script_to_nodes(
            node_ip_list_with_node_processes,
            self.vanilla_config.NODE_USERNAME,
            'cluster_config_test_script.sh'
        )
    except Exception as e:
        with excutils.save_and_reraise_exception():
            print(str(e))
    self.__check_config_application_on_cluster_nodes(
        node_ip_list_with_node_processes
    )
def create_cluster(self, cluster):
    ctx = context.ctx()
    launcher = _CreateLauncher()
    try:
        target_count = self._get_ng_counts(cluster)
        self._nullify_ng_counts(cluster)

        cluster = conductor.cluster_get(ctx, cluster)

        launcher.launch_instances(ctx, cluster, target_count)
    except Exception as ex:
        with excutils.save_and_reraise_exception():
            self._log_operation_exception(
                "Can't start cluster '%s' (reason: %s)", cluster, ex)

            cluster = conductor.cluster_update(
                ctx, cluster, {"status": "Error",
                               "status_description": str(ex)})
            LOG.info(g.format_cluster_status(cluster))
            self._rollback_cluster_creation(cluster)