def run_task(self, ip_address, process_id, settings, run_settings):
    """
    Run the ``start_running_process`` make target for one process over SSH.

    The remote location is built from ``settings['type']``, the relative
    output path returned by ``self.get_relative_output_path(settings)`` and
    ``process_id``. The call returns as soon as the make target returns;
    no polling is done here.

    :param ip_address: address of the host to connect to
    :param process_id: identifier joined onto the relative output path
    :param settings: mapping read for ``'type'`` and passed on to the URL
        and connection helpers
    :param run_settings: unused in this variant; kept so the signature
        stays unchanged
    """
    logger.debug("run_task %s" % ip_address)
    logger.debug("ip=%s" % ip_address)
    relative_path_suffix = self.get_relative_output_path(settings)
    relative_path = settings['type'] + '@' + os.path.join(
        relative_path_suffix, process_id)
    destination = get_url_with_credentials(settings,
                                           relative_path,
                                           is_relative_path=True,
                                           ip_address=ip_address)
    makefile_path = get_make_path(destination)
    # Open the connection *before* entering the try block: if it fails there
    # is nothing to close, and the original connection error must propagate
    # instead of being replaced by an unbound-name error in ``finally``.
    ssh = open_connection(ip_address=ip_address, settings=settings)
    try:
        command, _ = run_make(ssh, makefile_path, 'start_running_process')
        logger.debug('execute_command=%s' % command)
    finally:
        ssh.close()
def start_round_robin_reschedule(nodes, procs_2b_rescheduled, current_procs, settings, output_storage_settings, relative_path_suffix): total_nodes = len(nodes) all_nodes = list(nodes) processes = len(procs_2b_rescheduled) if total_nodes > processes: total_nodes = processes all_nodes = nodes[:total_nodes] if total_nodes == 0: return proc_per_node = processes / total_nodes remaining_procs = processes % total_nodes index = 0 new_processes = current_procs rescheduled_procs = list(procs_2b_rescheduled) for cur_node in all_nodes: logger.debug('Schedule here %s' % cur_node) ip_address = cur_node.ip_address if not ip_address: ip_address = cur_node.private_ip_address logger.debug('ip_address=%s' % ip_address) #relative_path = output_storage_settings['type'] + '@' + settings['payload_destination'] relative_path = output_storage_settings[ 'type'] + '@' + relative_path_suffix procs_on_cur_node = proc_per_node if remaining_procs: procs_on_cur_node = proc_per_node + 1 remaining_procs -= 1 logger.debug('procs_cur_node=%d' % procs_on_cur_node) ids = get_procs_ids(procs_on_cur_node, rescheduled_procs=rescheduled_procs) #index += len(ids) #logger.debug('index=%d' % index) put_proc_ids(relative_path, ids, ip_address, settings) new_processes = construct_lookup_table(ids, ip_address, new_processes, status='reschedule_ready', maximum_retry=int( settings['maximum_retry'])) destination = get_url_with_credentials(settings, relative_path, is_relative_path=True, ip_address=ip_address) logger.debug('schedule destination=%s' % destination) makefile_path = get_make_path(destination) logger.debug('makefile_path=%s' % makefile_path) command = "cd %s; make %s" % ( makefile_path, 'start_schedule PAYLOAD_NAME=%s IDS=%s' % (settings['payload_name'], settings['filename_for_PIDs'])) command_out = '' errs = '' logger.debug("starting command for %s" % ip_address) try: ssh = open_connection(ip_address=ip_address, settings=settings) command_out, errs = run_command_with_status(ssh, command) except 
Exception, e: logger.error(e) finally:
def start_round_robin_reschedule(nodes, procs_2b_rescheduled, current_procs, settings, output_storage_settings, relative_path_suffix): total_nodes = len(nodes) all_nodes = list(nodes) processes = len(procs_2b_rescheduled) if total_nodes > processes: total_nodes = processes all_nodes = nodes[:total_nodes] if total_nodes == 0: return proc_per_node = processes / total_nodes remaining_procs = processes % total_nodes index = 0 new_processes = current_procs rescheduled_procs = list(procs_2b_rescheduled) for cur_node in all_nodes: logger.debug('Schedule here %s' % cur_node) ip_address = cur_node.ip_address if not ip_address: ip_address = cur_node.private_ip_address logger.debug('ip_address=%s' % ip_address) #relative_path = output_storage_settings['type'] + '@' + settings['payload_destination'] relative_path = output_storage_settings['type'] + '@' + relative_path_suffix procs_on_cur_node = proc_per_node if remaining_procs: procs_on_cur_node = proc_per_node + 1 remaining_procs -= 1 logger.debug('procs_cur_node=%d' % procs_on_cur_node) ids = get_procs_ids(procs_on_cur_node, rescheduled_procs=rescheduled_procs) #index += len(ids) #logger.debug('index=%d' % index) put_proc_ids(relative_path, ids, ip_address, settings) new_processes = construct_lookup_table( ids, ip_address, new_processes, status='reschedule_ready', maximum_retry=int(settings['maximum_retry'])) destination = get_url_with_credentials(settings, relative_path, is_relative_path=True, ip_address=ip_address) logger.debug('schedule destination=%s' % destination) makefile_path = get_make_path(destination) logger.debug('makefile_path=%s' % makefile_path) command = "cd %s; make %s" % (makefile_path, 'start_schedule %s %s %s' % (settings['payload_name'], settings['filename_for_PIDs'], settings['process_output_dirname'], settings['smart_connector_input'])) command_out = '' errs = '' logger.debug("starting command for %s" % ip_address) try: ssh = open_connection(ip_address=ip_address, settings=settings) command_out, errs = 
run_command_with_status(ssh, command) except Exception, e: logger.error(e) finally:
def run_command(command, ip_address, settings):
    """
    Execute ``command`` on ``ip_address`` over a fresh SSH connection.

    The connection is always closed, whether or not the command raised.

    :returns: the ``(output, err)`` pair produced by
        ``run_command_with_status``
    """
    connection = open_connection(ip_address=ip_address, settings=settings)
    try:
        output, err = run_command_with_status(connection, command)
        return (output, err)
    finally:
        # Runs before the return above completes, and on any exception.
        connection.close()
def run_task(self, ip_address, process_id, settings, run_settings):
    """
    Run the ``start_running_process`` make target for one process over SSH.

    Two invocation forms exist. When ``run_settings`` contains the
    ``<SCHEMA_PREFIX>/input/system/compplatform/hadoop`` key, the target is
    run without sudo and receives the input/output directory names, the
    hadoop home path, the two ``HADOOP_*_<contextid>`` names and any
    optional arguments. If a ``KeyError`` is raised anywhere in that branch
    the target is instead run with sudo and only the input and output
    directory names.

    Side effect: ``self.hadoop_input`` and ``self.hadoop_output`` are set
    before the hadoop key is looked up, so they are assigned in both cases.

    :param ip_address: address of the host to connect to
    :param process_id: identifier joined onto the relative output path
    :param settings: mapping read for ``'type'``,
        ``'process_output_dirname'``, ``'smart_connector_input'`` and
        (hadoop form only) ``'hadoop_home_path'``
    :param run_settings: mapping probed for the hadoop platform key and
        passed to ``self.get_optional_args``
    """
    logger.debug("run_task %s" % ip_address)
    logger.debug("ip=%s" % ip_address)
    relative_path_suffix = self.get_relative_output_path(settings)
    relative_path = settings['type'] + '@' + \
        os.path.join(relative_path_suffix, process_id)
    destination = get_url_with_credentials(settings,
                                           relative_path,
                                           is_relative_path=True,
                                           ip_address=ip_address)
    makefile_path = get_make_path(destination)
    # Open the connection *before* entering the try block: if it fails there
    # is nothing to close, and the original connection error must propagate
    # instead of being replaced by an unbound-name error in ``finally``.
    ssh = open_connection(ip_address=ip_address, settings=settings)
    try:
        logger.debug(settings['process_output_dirname'])
        try:
            self.hadoop_input = 'HADOOP_INPUT_%s' % self.contextid
            self.hadoop_output = 'HADOOP_OUTPUT_%s' % self.contextid
            # Looked up only for its KeyError: a missing key selects the
            # sudo fallback below. The value itself is not used.
            run_settings['%s/input/system/compplatform/hadoop'
                         % django_settings.SCHEMA_PREFIX]
            sudo = False
            options = '%s %s %s %s %s ' % (
                settings['smart_connector_input'],
                settings['process_output_dirname'],
                settings['hadoop_home_path'],
                self.hadoop_input,
                self.hadoop_output)
            logger.debug('options = %s ' % options)
            optional_args = self.get_optional_args(run_settings)
            if optional_args:
                options += " %s" % optional_args
                logger.debug('options = %s ' % options)
            command, _ = run_make(ssh, makefile_path,
                                  'start_running_process %s' % options,
                                  sudo=sudo)
        except KeyError:
            sudo = True
            command, _ = run_make(
                ssh, makefile_path,
                'start_running_process %s %s' % (
                    settings['smart_connector_input'],
                    settings['process_output_dirname']),
                sudo=sudo)
        logger.debug('execute_command=%s' % command)
    finally:
        ssh.close()
def start_round_robin_schedule(nodes, processes, schedule_index, settings, relative_path_suffix): total_nodes = len(nodes) all_nodes = list(nodes) if total_nodes > processes: total_nodes = processes all_nodes = nodes[:total_nodes] if total_nodes == 0: return proc_per_node = processes / total_nodes remaining_procs = processes % total_nodes index = schedule_index new_processes = [] for cur_node in all_nodes: ip_address = cur_node[1] #relative_path = settings['type'] + '@' + settings['payload_destination'] relative_path = settings['type'] + '@' + relative_path_suffix procs_on_cur_node = proc_per_node if remaining_procs: procs_on_cur_node = proc_per_node + 1 remaining_procs -= 1 logger.debug('procs_cur_node=%d' % procs_on_cur_node) ids = get_procs_ids(procs_on_cur_node, index=index) index += len(ids) logger.debug('index=%d' % index) put_proc_ids(relative_path, ids, ip_address, settings) new_processes = construct_lookup_table( ids, ip_address, new_processes, maximum_retry=int(settings['maximum_retry'])) destination = get_url_with_credentials( settings, relative_path, is_relative_path=True, ip_address=ip_address) logger.debug('schedule destination=%s' % destination) makefile_path = get_make_path(destination) logger.debug('makefile_path=%s' % makefile_path) command = "cd %s; make %s" % (makefile_path, 'start_schedule %s %s %s %s' % ( settings['payload_name'], settings['filename_for_PIDs'], settings['process_output_dirname'], settings['smart_connector_input'])) command_out = '' errs = '' logger.debug("starting command for %s" % ip_address) try: ssh = open_connection(ip_address=ip_address, settings=settings) command_out, errs = run_command_with_status(ssh, command) except Exception, e: logger.error(e) finally:
def _is_ssh_ready(settings, ip_address):
    """
    Poll ``ip_address`` until an SSH connection can be opened.

    A failed attempt whose message contains ``Connection refused`` or
    ``Authentication failed`` does not count against the retry budget;
    any other failure does. After every failed attempt the function
    sleeps for ``settings['cloud_sleep_interval']`` seconds.

    NOTE(review): because the two named errors never consume a retry, a
    host that keeps returning them is polled indefinitely. That is the
    behaviour the original branches express; confirm it is wanted.

    :param settings: mapping read for ``'cloud_sleep_interval'`` and passed
        to ``open_connection``
    :param ip_address: host to probe
    :returns: True once a connection succeeded, False when the retry budget
        (3 minutes' worth of sleep intervals) was used up
    """
    # Maximum wait time is 3 minutes.
    minutes = 3  # FIXME: avoid hard coding; move to settings.py
    sleep_interval = settings['cloud_sleep_interval']
    max_retries = (minutes * 60) / sleep_interval
    retries = 0
    ssh_ready = False
    while not ssh_ready and retries < max_retries:
        logger.debug("Connecting to %s in progress ..." % ip_address)
        try:
            ssh = open_connection(ip_address, settings)
        except Exception as ex:
            logger.debug("[%s] Exception: %s" % (ip_address, ex))
            # Match on the message text. Testing membership on the
            # exception object itself only matched exact argument values on
            # Python 2 and raises TypeError on Python 3.
            error_text = str(ex)
            transient = ('Connection refused' in error_text
                         or 'Authentication failed' in error_text)
            if not transient:
                retries += 1
            time.sleep(sleep_interval)
        else:
            ssh_ready = True
            # The connection was only a probe; release it.
            if ssh is not None:
                ssh.close()
    logger.debug("Connecting to %s completed" % ip_address)
    return ssh_ready
def start_round_robin_schedule(nodes, processes, schedule_index, settings, relative_path_suffix): total_nodes = len(nodes) all_nodes = list(nodes) if total_nodes > processes: total_nodes = processes all_nodes = nodes[:total_nodes] if total_nodes == 0: return proc_per_node = processes / total_nodes remaining_procs = processes % total_nodes index = schedule_index new_processes = [] for cur_node in all_nodes: ip_address = cur_node[1] #relative_path = settings['type'] + '@' + settings['payload_destination'] relative_path = settings['type'] + '@' + relative_path_suffix procs_on_cur_node = proc_per_node if remaining_procs: procs_on_cur_node = proc_per_node + 1 remaining_procs -= 1 logger.debug('procs_cur_node=%d' % procs_on_cur_node) ids = get_procs_ids(procs_on_cur_node, index=index) index += len(ids) logger.debug('index=%d' % index) put_proc_ids(relative_path, ids, ip_address, settings) new_processes = construct_lookup_table( ids, ip_address, new_processes, maximum_retry=int(settings['maximum_retry'])) destination = get_url_with_credentials( settings, relative_path, is_relative_path=True, ip_address=ip_address) logger.debug('schedule destination=%s' % destination) makefile_path = get_make_path(destination) logger.debug('makefile_path=%s' % makefile_path) command = "cd %s; make %s" % (makefile_path, 'start_schedule PAYLOAD_NAME=%s IDS=%s' % ( settings['payload_name'], settings['filename_for_PIDs'])) command_out = '' errs = '' logger.debug("starting command for %s" % ip_address) try: ssh = open_connection(ip_address=ip_address, settings=settings) command_out, errs = run_command_with_status(ssh, command) except Exception, e: logger.error(e) finally:
def _is_ssh_ready(settings, ip_address):
    """
    Poll ``ip_address`` until an SSH connection can be opened.

    A failed attempt whose message contains ``Connection refused`` or
    ``Authentication failed`` does not count against the retry budget;
    any other failure does. After every failed attempt the function
    sleeps for ``settings['cloud_sleep_interval']`` seconds.

    NOTE(review): because the two named errors never consume a retry, a
    host that keeps returning them is polled indefinitely. That is the
    behaviour the original branches express; confirm it is wanted.

    :param settings: mapping read for ``'cloud_sleep_interval'`` and passed
        to ``open_connection``
    :param ip_address: host to probe
    :returns: True once a connection succeeded, False when the retry budget
        (3 minutes' worth of sleep intervals) was used up
    """
    # Maximum wait time is 3 minutes.
    minutes = 3  # FIXME: avoid hard coding; move to settings.py
    sleep_interval = settings['cloud_sleep_interval']
    max_retries = (minutes * 60) / sleep_interval
    retries = 0
    ssh_ready = False
    while not ssh_ready and retries < max_retries:
        logger.debug("Connecting to %s in progress ..." % ip_address)
        try:
            ssh = open_connection(ip_address, settings)
        except Exception as ex:
            logger.debug("[%s] Exception: %s" % (ip_address, ex))
            # Match on the message text. Testing membership on the
            # exception object itself only matched exact argument values on
            # Python 2 and raises TypeError on Python 3.
            error_text = str(ex)
            transient = ('Connection refused' in error_text
                         or 'Authentication failed' in error_text)
            if not transient:
                retries += 1
            time.sleep(sleep_interval)
        else:
            ssh_ready = True
            # The connection was only a probe; release it.
            if ssh is not None:
                ssh.close()
    logger.debug("Connecting to %s completed" % ip_address)
    return ssh_ready
def run_task(self, ip_address, process_id, settings, run_settings):
    """
    Run the ``start_running_process`` make target for one process over SSH.

    Two invocation forms exist. When ``run_settings`` contains the
    ``<SCHEMA_PREFIX>/input/system/compplatform/hadoop`` key, the target is
    run without sudo and receives the input/output directory names, the
    hadoop home path, the two ``HADOOP_*_<contextid>`` names and any
    optional arguments. If a ``KeyError`` is raised anywhere in that branch
    the target is instead run with sudo and only the input and output
    directory names.

    Side effect: ``self.hadoop_input`` and ``self.hadoop_output`` are set
    before the hadoop key is looked up, so they are assigned in both cases.

    :param ip_address: address of the host to connect to
    :param process_id: identifier joined onto the relative output path
    :param settings: mapping read for ``'type'``,
        ``'process_output_dirname'``, ``'smart_connector_input'`` and
        (hadoop form only) ``'hadoop_home_path'``
    :param run_settings: mapping probed for the hadoop platform key and
        passed to ``self.get_optional_args``
    """
    logger.debug("run_task %s" % ip_address)
    logger.debug("ip=%s" % ip_address)
    relative_path_suffix = self.get_relative_output_path(settings)
    relative_path = settings['type'] + '@' + \
        os.path.join(relative_path_suffix, process_id)
    destination = get_url_with_credentials(settings,
                                           relative_path,
                                           is_relative_path=True,
                                           ip_address=ip_address)
    makefile_path = get_make_path(destination)
    # Open the connection *before* entering the try block: if it fails there
    # is nothing to close, and the original connection error must propagate
    # instead of being replaced by an unbound-name error in ``finally``.
    ssh = open_connection(ip_address=ip_address, settings=settings)
    try:
        logger.debug(settings['process_output_dirname'])
        try:
            self.hadoop_input = 'HADOOP_INPUT_%s' % self.contextid
            self.hadoop_output = 'HADOOP_OUTPUT_%s' % self.contextid
            # Looked up only for its KeyError: a missing key selects the
            # sudo fallback below. The value itself is not used.
            run_settings['%s/input/system/compplatform/hadoop'
                         % django_settings.SCHEMA_PREFIX]
            sudo = False
            options = '%s %s %s %s %s ' % (
                settings['smart_connector_input'],
                settings['process_output_dirname'],
                settings['hadoop_home_path'],
                self.hadoop_input,
                self.hadoop_output)
            logger.debug('options = %s ' % options)
            optional_args = self.get_optional_args(run_settings)
            if optional_args:
                options += " %s" % optional_args
                logger.debug('options = %s ' % options)
            command, _ = run_make(ssh, makefile_path,
                                  'start_running_process %s' % options,
                                  sudo=sudo)
        except KeyError:
            sudo = True
            command, _ = run_make(
                ssh, makefile_path,
                'start_running_process %s %s' % (
                    settings['smart_connector_input'],
                    settings['process_output_dirname']),
                sudo=sudo)
        logger.debug('execute_command=%s' % command)
    finally:
        ssh.close()
def _is_bootstrap_complete(ip, settings, destination):
    """
    Report whether the ``bootstrap_done`` make target signals completion.

    Runs the target over SSH in the Makefile directory derived from
    ``destination`` and scans its output for the marker text
    ``Environment Setup Completed``. When the target produced no output,
    its error output is logged as a warning.

    :param ip: host to connect to
    :param settings: passed to ``open_connection``
    :param destination: URL handed to ``get_make_path``
    :returns: True if any output line contains the marker, else False
    """
    ssh = open_connection(ip_address=ip, settings=settings)
    try:
        makefile_path = get_make_path(destination)
        (command_out, err) = run_make(ssh, makefile_path, 'bootstrap_done')
    finally:
        # The connection used to be left open on every call.
        ssh.close()
    if not command_out:
        logger.warn(err)
        return False
    logger.debug("command_out = %s" % command_out)
    return any('Environment Setup Completed' in line
               for line in command_out)
def _is_schedule_complete(ip, settings, destination):
    """
    Report whether the ``schedule_done`` make target signals completion.

    Runs ``make schedule_done IDS=<settings['filename_for_PIDs']>`` over
    SSH in the Makefile directory derived from ``destination`` and scans
    the output for the marker text ``All processes are scheduled``.

    :param ip: host to connect to
    :param settings: mapping read for ``'filename_for_PIDs'`` and passed to
        ``open_connection``
    :param destination: URL handed to ``get_make_path``
    :returns: True if any output line contains the marker, else False
    """
    ssh = open_connection(ip_address=ip, settings=settings)
    try:
        makefile_path = get_make_path(destination)
        command = "cd %s; make %s" % (
            makefile_path,
            'schedule_done IDS=%s' % (settings['filename_for_PIDs']))
        command_out, _ = run_command_with_status(ssh, command)
    finally:
        # The connection used to be left open on every call.
        ssh.close()
    logger.debug('command=%s' % command)
    if not command_out:
        return False
    logger.debug("command_out = %s" % command_out)
    return any('All processes are scheduled' in line
               for line in command_out)
def _is_schedule_complete(ip, settings, destination):
    """
    Report whether the ``schedule_done`` make target signals completion.

    Runs ``make schedule_done IDS=<settings['filename_for_PIDs']>`` over
    SSH in the Makefile directory derived from ``destination`` and scans
    the output for the marker text ``All processes are scheduled``.

    :param ip: host to connect to
    :param settings: mapping read for ``'filename_for_PIDs'`` and passed to
        ``open_connection``
    :param destination: URL handed to ``get_make_path``
    :returns: True if any output line contains the marker, else False
    """
    ssh = open_connection(ip_address=ip, settings=settings)
    try:
        makefile_path = get_make_path(destination)
        command = "cd %s; make %s" % (
            makefile_path,
            'schedule_done IDS=%s' % (settings['filename_for_PIDs']))
        command_out, _ = run_command_with_status(ssh, command)
    finally:
        # The connection used to be left open on every call.
        ssh.close()
    logger.debug('command=%s' % command)
    if not command_out:
        return False
    logger.debug("command_out = %s" % command_out)
    return any('All processes are scheduled' in line
               for line in command_out)
def _job_finished(self, settings, remote_path):
    """
    Run the ``running`` make target in the remote job directory.

    The directory is ``query_settings['root_path']`` joined with the path
    component parsed from the credential-bearing URL for ``remote_path``.
    Any exception is logged and re-raised.

    NOTE(review): the visible code captures the make output but does not
    inspect or return it, so this returns None. Presumably the remainder
    of the function lies outside this view -- confirm.

    :param settings: mapping read for ``'host'`` and passed to the URL and
        connection helpers
    :param remote_path: relative path of the job on the remote host
    """
    encoded_d_url = storage.get_url_with_credentials(
        settings=settings,
        url_or_relative_path=remote_path,
        is_relative_path=True,
        ip_address=settings['host'])
    (_scheme, host, mypath, _location, query_settings) = \
        storage.parse_bdpurl(encoded_d_url)
    stdout = ''
    stderr = ''
    ssh = None
    try:
        ssh = open_connection(ip_address=host, settings=settings)
        (stdout, stderr) = compute.run_make(
            ssh,
            os.path.join(query_settings['root_path'], mypath),
            'running')
    except Exception as e:
        logger.error(e)
        raise
    finally:
        # Release the connection on success and failure alike; ``ssh`` is
        # still None when open_connection itself failed.
        if ssh is not None:
            ssh.close()
def _start_bootstrap(instance, ip, settings, source, destination):
    """
    Copy the payload to the host and run the ``start_bootstrap`` target.

    Steps, in order: copy ``source`` to ``destination``; open an SSH
    connection to ``ip``; run ``yum install -y make``; run the
    ``start_bootstrap`` make target in the Makefile directory derived from
    ``destination``. Any exception from the SSH steps is logged and
    re-raised.

    :param instance: used only for the log message
    :param ip: host to connect to
    :param settings: passed to ``open_connection``
    :param source: copy source for ``copy_directories``
    :param destination: copy target; also handed to ``get_make_path``
    """
    logger.info("run_task %s" % str(instance))
    copy_directories(source, destination)
    makefile_path = get_make_path(destination)
    # TODO, FIXME: need to have timeout for yum install make
    # and then test can access, otherwise, loop.
    install_make = 'yum install -y make'
    logger.debug("starting command for %s" % ip)
    ssh = None
    try:
        ssh = open_connection(ip_address=ip, settings=settings)
        command_out, errs = run_command_with_status(ssh, install_make)
        logger.debug("command_out1=(%s, %s)" % (command_out, errs))
        run_make(ssh, makefile_path, 'start_bootstrap')
    except Exception as e:
        # FIXME: consider using reliability framework
        logger.error(e)
        raise
    finally:
        # Release the connection on success and failure alike; ``ssh`` is
        # still None when open_connection itself failed.
        if ssh is not None:
            ssh.close()
def _job_finished(self, settings, remote_path):
    """
    Run the ``running`` make target in the remote job directory.

    The directory is ``query_settings['root_path']`` joined with the path
    component parsed from the credential-bearing URL for ``remote_path``.
    Any exception is logged and re-raised.

    NOTE(review): the visible code captures the make output but does not
    inspect or return it, so this returns None. Presumably the remainder
    of the function lies outside this view -- confirm.

    :param settings: mapping read for ``'host'`` and passed to the URL and
        connection helpers
    :param remote_path: relative path of the job on the remote host
    """
    encoded_d_url = storage.get_url_with_credentials(
        settings=settings,
        url_or_relative_path=remote_path,
        is_relative_path=True,
        ip_address=settings['host'])
    (_scheme, host, mypath, _location, query_settings) = \
        storage.parse_bdpurl(encoded_d_url)
    stdout = ''
    stderr = ''
    ssh = None
    try:
        ssh = open_connection(ip_address=host, settings=settings)
        (stdout, stderr) = compute.run_make(
            ssh,
            os.path.join(query_settings['root_path'], mypath),
            'running')
    except Exception as e:
        logger.error(e)
        raise
    finally:
        # Release the connection on success and failure alike; ``ssh`` is
        # still None when open_connection itself failed.
        if ssh is not None:
            ssh.close()
def _start_bootstrap(instance, ip, settings, source, destination):
    """
    Copy the payload to the host and run the ``start_bootstrap`` target.

    Steps, in order: copy ``source`` to ``destination``; open an SSH
    connection to ``ip``; run ``yum install -y make``; run the
    ``start_bootstrap`` make target in the Makefile directory derived from
    ``destination``. Any exception from the SSH steps is logged and
    re-raised.

    :param instance: used only for the log message
    :param ip: host to connect to
    :param settings: passed to ``open_connection``
    :param source: copy source for ``copy_directories``
    :param destination: copy target; also handed to ``get_make_path``
    """
    logger.info("run_task %s" % str(instance))
    copy_directories(source, destination)
    makefile_path = get_make_path(destination)
    # TODO, FIXME: need to have timeout for yum install make
    # and then test can access, otherwise, loop.
    install_make = 'yum install -y make'
    logger.debug("starting command for %s" % ip)
    ssh = None
    try:
        ssh = open_connection(ip_address=ip, settings=settings)
        command_out, errs = run_command_with_status(ssh, install_make)
        logger.debug("command_out1=(%s, %s)" % (command_out, errs))
        run_make(ssh, makefile_path, 'start_bootstrap')
    except Exception as e:
        # FIXME: consider using reliability framework
        logger.error(e)
        raise
    finally:
        # Release the connection on success and failure alike; ``ssh`` is
        # still None when open_connection itself failed.
        if ssh is not None:
            ssh.close()
def process(self, run_settings):
    """
    Run the ``startrun`` make target on the configured compute host.

    Settings come from ``setup_settings(run_settings)`` and are extended
    with the platform settings for ``settings['comp_platform_url']`` and
    ``settings['bdp_username']``. The make directory is
    ``query_settings['root_path']`` joined with the path parsed from the
    destination URL ``nci@<payload_destination>/<contextid>``. Any
    exception from the SSH steps is logged and re-raised.

    NOTE(review): the visible code does not use the make output after the
    call; presumably the remainder of the method lies outside this view --
    confirm.

    :param run_settings: run context passed to ``setup_settings`` and
        ``messages.info``
    """
    settings = setup_settings(run_settings)
    messages.info(run_settings, "1: execute starting")

    def _get_dest_bdp_url(settings):
        return "%s@%s" % (
            "nci",
            os.path.join(settings['payload_destination'],
                         str(settings['contextid'])))

    dest_url = _get_dest_bdp_url(settings)
    computation_platform_url = settings['comp_platform_url']
    bdp_username = settings['bdp_username']
    comp_pltf_settings = manage.get_platform_settings(
        computation_platform_url, bdp_username)
    logger.debug("comp_pltf_settings=%s" % pformat(comp_pltf_settings))
    settings.update(comp_pltf_settings)
    encoded_d_url = storage.get_url_with_credentials(
        settings,
        dest_url,
        is_relative_path=True,
        ip_address=settings['host'])
    (_scheme, _host, mypath, _location, query_settings) = \
        storage.parse_bdpurl(encoded_d_url)
    stderr = ''
    ssh = None
    try:
        ssh = open_connection(ip_address=settings['host'],
                              settings=settings)
        (command_out, stderr) = compute.run_make(
            ssh,
            os.path.join(query_settings['root_path'], mypath),
            'startrun')
    except Exception as e:
        logger.error(e)
        raise
    finally:
        # Release the connection on success and failure alike; ``ssh`` is
        # still None when open_connection itself failed.
        if ssh is not None:
            ssh.close()
def generate_rfs_key(parameters):
    """
    Generate an RSA key pair and append the public key to the remote
    ``authorized_keys`` file.

    Steps, in order:

    1. Pick a key file name (base name of ``parameters['private_key_path']``
       without its extension, suffixed ``_<n>`` while a file of that name
       exists in the key directory) and create the key directory if needed.
    2. Write the private key and a ``.pub`` file under the BDP root path.
    3. Save the public key to ``<home_path>/.ssh/<key_name>.pub`` through
       ``storage.RemoteStorage``.
    4. Over SSH, append the public key to ``<home_path>/.ssh/authorized_keys``
       and ``chmod 600`` that file.

    Side effect: ``parameters['private_key_path']`` is overwritten with the
    chosen (extension-less, possibly suffixed) relative key path.

    NOTE(review): the visible code computes ``key_generated`` and
    ``message`` but ends without returning them; presumably the remainder
    of the function lies outside this view -- confirm.

    :param parameters: mapping read for ``'username'``, ``'port'``,
        ``'ip_address'``, ``'private_key_path'``, ``'home_path'`` and the
        optional ``'password'``
    :raises IOError: when the remote command reports errors other than
        ``Permission denied``
    """
    key_generated = True
    message = 'Key generated successfully'
    password = parameters.get('password', '')
    ssh_settings = {'username': parameters['username'],
                    'port': parameters['port'],
                    'password': password}
    storage_settings = {'params': ssh_settings,
                        'host': parameters['ip_address'],
                        'root': "/"}
    bdp_root_path = storage.get_bdp_root_path()
    key_name_org = os.path.splitext(
        os.path.basename(parameters['private_key_path']))[0]
    key_name = key_name_org
    private_key_absolute_path = os.path.join(
        bdp_root_path, parameters['private_key_path'])
    key_dir = os.path.dirname(private_key_absolute_path)
    if not os.path.exists(key_dir):
        os.makedirs(key_dir)
    # Avoid clobbering an existing key: try name, name_1, name_2, ...
    counter = 1
    while os.path.exists(os.path.join(key_dir, key_name)):
        key_name = '%s_%d' % (key_name_org, counter)
        counter += 1
    parameters['private_key_path'] = os.path.join(
        os.path.dirname(parameters['private_key_path']), key_name)
    private_key_absolute_path = os.path.join(
        bdp_root_path, parameters['private_key_path'])
    public_key_absolute_path = '%s.pub' % private_key_absolute_path
    remote_key_path = os.path.join(parameters['home_path'], '.ssh',
                                   ('%s.pub' % key_name))
    authorized_remote_path = os.path.join(parameters['home_path'], '.ssh',
                                          'authorized_keys')
    try:
        # NOTE(review): 1024-bit RSA is a short key length by current
        # guidance; left unchanged here so generated keys stay the same.
        private_key = paramiko.RSAKey.generate(1024)
        private_key.write_private_key_file(private_key_absolute_path)
        public_key = paramiko.RSAKey(filename=private_key_absolute_path)
        public_key_content = '%s %s' % (public_key.get_name(),
                                        public_key.get_base64())
        with open(public_key_absolute_path, 'w') as f:
            f.write("\n%s\n" % public_key_content)
        fs = storage.RemoteStorage(settings=storage_settings)
        fs.save(remote_key_path, ContentFile(public_key_content))
        ssh_client = open_connection(parameters['ip_address'], ssh_settings)
        try:
            space = " "
            command = 'echo %s >> %s; echo %s >> %s; echo %s >> %s; chmod 600 %s' % (
                space, authorized_remote_path,
                public_key_content, authorized_remote_path,
                space, authorized_remote_path,
                authorized_remote_path)
            command_out, errs = run_command_with_status(ssh_client, command)
        finally:
            # The client used to be left open on every path.
            ssh_client.close()
        if errs:
            if 'Permission denied' in errs:
                key_generated = False
                message = 'Permission denied to copy public key to %s/.ssh/authorized_keys' % parameters['home_path']
            else:
                raise IOError
    except AuthError:
        key_generated = False
        message = 'Unauthorized access to %s' % parameters['ip_address']
    except socket.gaierror as e:
        key_generated = False
        # Match on the message text; membership on the exception object
        # itself raises TypeError on Python 3.
        if 'Name or service not known' in str(e):
            message = 'Unknown IP address [%s]' % parameters['ip_address']
        else:
            message = '[%s]: %s, %s' % (parameters['ip_address'],
                                        e.__doc__, e.strerror)
def is_job_finished(self, wait_class, ip_address, process_id, retry_left,
                    settings, relative_path_suffix):
    """
    Check whether the process's ``process_running_done`` target reports
    ``stopped``.

    The target is run over SSH in the Makefile directory derived from
    ``settings['type']``, ``relative_path_suffix`` and ``process_id``.

    If any step raises and ``wait_class.failure_detector`` classifies the
    exception as a failed SSH connection, the failure is handed to
    ``wait_class.ftmanager.manage_failed_process`` and its result stored in
    ``wait_class.failed_processes``; any other exception is re-raised.

    :param wait_class: object providing ``failure_detector``,
        ``ftmanager``, ``created_nodes`` and the process/node lists passed
        to ``manage_failed_process``
    :param ip_address: host running the process
    :param process_id: identifier joined onto ``relative_path_suffix``
    :param retry_left: unused in the visible code
    :param settings: mapping read for ``'type'`` and passed to the helpers
    :param relative_path_suffix: relative directory holding the processes
    :returns: True when an output line contains ``stopped``; otherwise the
        visible code falls through and returns None
    """
    # TODO: maybe this should be a reusable library method?
    ip = ip_address
    logger.debug("ip=%s" % ip)
    relative_path = settings['type'] + '@' + os.path.join(
        relative_path_suffix, process_id)
    destination = get_url_with_credentials(settings,
                                           relative_path,
                                           is_relative_path=True,
                                           ip_address=ip)
    makefile_path = get_make_path(destination)
    ssh = None
    try:
        logger.debug('trying ssh')
        ssh = open_connection(ip_address=ip, settings=settings)
        logger.debug('successful ssh')
        (command_out, errs) = run_make(ssh, makefile_path,
                                       "process_running_done")
        logger.debug("command_out2=(%s, %s)" % (command_out, errs))
        if command_out:
            logger.debug("command_out = %s" % command_out)
            for line in command_out:
                if "stopped" in line:
                    return True
    except Exception as e:
        # Failure detection and then management
        logger.debug('error is = %s' % e)
        ssh_failed = wait_class.failure_detector.failed_ssh_connection(e)
        logger.debug('Is there error? %s' % ssh_failed)
        if not ssh_failed:
            raise
        node = [x for x in wait_class.created_nodes if x[1] == ip_address]
        wait_class.failed_processes = \
            wait_class.ftmanager.manage_failed_process(
                settings, process_id, node[0], node[0][0], ip_address,
                wait_class.failed_nodes, wait_class.executed_procs,
                wait_class.current_processes, wait_class.all_processes,
                wait_class.procs_2b_rescheduled)
    finally:
        # Close on every path; previously the connection leaked whenever
        # run_make raised after a successful connect.
        if ssh is not None:
            ssh.close()
def generate_unix_key(parameters):
    """
    Generate an RSA key pair and append the public key to the remote
    ``authorized_keys`` file.

    Steps, in order:

    1. Pick a key file name (base name of ``parameters['private_key_path']``
       without its extension, suffixed ``_<n>`` while a file of that name
       exists in the key directory) and create the key directory if needed.
    2. Write the private key and a ``.pub`` file under the BDP root path.
    3. Save the public key to ``<home_path>/.ssh/<key_name>.pub`` through
       ``storage.RemoteStorage``.
    4. Over SSH, append the public key to ``<home_path>/.ssh/authorized_keys``
       and ``chmod 600`` that file.

    Side effect: ``parameters['private_key_path']`` is overwritten with the
    chosen (extension-less, possibly suffixed) relative key path.

    NOTE(review): the visible code computes ``key_generated`` and
    ``message`` but ends without returning them; presumably the remainder
    of the function lies outside this view -- confirm.

    :param parameters: mapping read for ``'username'``, ``'ip_address'``,
        ``'private_key_path'``, ``'home_path'`` and the optional
        ``'password'``
    :raises IOError: when the remote command reports errors other than
        ``Permission denied``
    """
    key_generated = True
    message = 'Key generated successfully'
    password = parameters.get('password', '')
    ssh_settings = {'username': parameters['username'],
                    'password': password}
    storage_settings = {'params': ssh_settings,
                        'host': parameters['ip_address'],
                        'root': "/"}
    bdp_root_path = storage.get_bdp_root_path()
    key_name_org = os.path.splitext(
        os.path.basename(parameters['private_key_path']))[0]
    key_name = key_name_org
    private_key_absolute_path = os.path.join(
        bdp_root_path, parameters['private_key_path'])
    key_dir = os.path.dirname(private_key_absolute_path)
    if not os.path.exists(key_dir):
        os.makedirs(key_dir)
    # Avoid clobbering an existing key: try name, name_1, name_2, ...
    counter = 1
    while os.path.exists(os.path.join(key_dir, key_name)):
        key_name = '%s_%d' % (key_name_org, counter)
        counter += 1
    parameters['private_key_path'] = os.path.join(
        os.path.dirname(parameters['private_key_path']), key_name)
    private_key_absolute_path = os.path.join(
        bdp_root_path, parameters['private_key_path'])
    public_key_absolute_path = '%s.pub' % private_key_absolute_path
    remote_key_path = os.path.join(parameters['home_path'], '.ssh',
                                   ('%s.pub' % key_name))
    authorized_remote_path = os.path.join(parameters['home_path'], '.ssh',
                                          'authorized_keys')
    try:
        # NOTE(review): 1024-bit RSA is a short key length by current
        # guidance; left unchanged here so generated keys stay the same.
        private_key = paramiko.RSAKey.generate(1024)
        private_key.write_private_key_file(private_key_absolute_path)
        public_key = paramiko.RSAKey(filename=private_key_absolute_path)
        public_key_content = '%s %s' % (public_key.get_name(),
                                        public_key.get_base64())
        with open(public_key_absolute_path, 'w') as f:
            f.write("\n%s\n" % public_key_content)
        fs = storage.RemoteStorage(settings=storage_settings)
        fs.save(remote_key_path, ContentFile(public_key_content))
        ssh_client = open_connection(parameters['ip_address'], ssh_settings)
        try:
            space = " "
            command = 'echo %s >> %s; echo %s >> %s; echo %s >> %s; chmod 600 %s' % (
                space, authorized_remote_path,
                public_key_content, authorized_remote_path,
                space, authorized_remote_path,
                authorized_remote_path)
            command_out, errs = run_command_with_status(ssh_client, command)
        finally:
            # The client used to be left open on every path.
            ssh_client.close()
        if errs:
            if 'Permission denied' in errs:
                key_generated = False
                message = 'Permission denied to copy public key to %s/.ssh/authorized_keys' % parameters['home_path']
            else:
                raise IOError
    except AuthError:
        key_generated = False
        message = 'Unauthorized access to %s' % parameters['ip_address']
    except socket.gaierror as e:
        key_generated = False
        # Match on the message text; membership on the exception object
        # itself raises TypeError on Python 3.
        if 'Name or service not known' in str(e):
            message = 'Unknown IP address [%s]' % parameters['ip_address']
        else:
            message = '[%s]: %s, %s' % (parameters['ip_address'],
                                        e.__doc__, e.strerror)
def is_job_finished(self, ip_address, process_id, retry_left, settings,
                    relative_path_suffix):
    """
    Check whether the process's ``process_running_done`` target reports
    ``stopped``.

    The target is run over SSH in the Makefile directory derived from
    ``settings['type']``, ``relative_path_suffix`` and ``process_id``.
    For the duration of the call ``settings['username']`` is overridden;
    the caller's value is put back before returning or raising.

    If any step raises and ``self.failure_detector`` classifies the
    exception as a failed SSH connection, the failure is handed to
    ``self.ftmanager.manage_failed_process`` and its result stored in
    ``self.failed_processes``; any other exception is re-raised.

    :param ip_address: host running the process
    :param process_id: identifier joined onto ``relative_path_suffix``
    :param retry_left: unused in the visible code
    :param settings: mapping read for ``'type'`` and ``'username'`` and
        passed to the helpers
    :param relative_path_suffix: relative directory holding the processes
    :returns: True when an output line contains ``stopped``; otherwise the
        visible code falls through and returns None
    """
    # TODO: maybe this should be a reusable library method?
    ip = ip_address
    logger.debug("ip=%s" % ip)
    curr_username = settings['username']
    settings['username'] = '******'
    ssh = None
    try:
        relative_path = settings['type'] + '@' + os.path.join(
            relative_path_suffix, process_id)
        destination = get_url_with_credentials(settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=ip)
        makefile_path = get_make_path(destination)
        try:
            logger.debug('trying ssh')
            ssh = open_connection(ip_address=ip, settings=settings)
            logger.debug('successful ssh')
            (command_out, errs) = run_make(ssh, makefile_path,
                                           "process_running_done")
            logger.debug("command_out2=(%s, %s)" % (command_out, errs))
            if command_out:
                logger.debug("command_out = %s" % command_out)
                for line in command_out:
                    if "stopped" in line:
                        return True
        except Exception as e:
            # Failure detection and then management
            logger.debug('error is = %s' % e)
            ssh_failed = self.failure_detector.failed_ssh_connection(e)
            logger.debug('Is there error? %s' % ssh_failed)
            if not ssh_failed:
                raise
            node = [x for x in self.created_nodes if x[1] == ip_address]
            self.failed_processes = self.ftmanager.manage_failed_process(
                settings, process_id, node[0], node[0][0], ip_address,
                self.failed_nodes, self.executed_procs,
                self.current_processes, self.all_processes,
                self.procs_2b_rescheduled)
    finally:
        # Close on every path; previously the connection leaked whenever
        # run_make raised after a successful connect.
        if ssh is not None:
            ssh.close()
        # The caller's username was saved above but never put back, so the
        # override leaked into the shared settings mapping.
        settings['username'] = curr_username