def _upload_variations_inputs(settings, source_url_initial, values_map):
    bdp_username = settings['bdp_username']
    logger.debug("source_url_initial=%s" % source_url_initial)
    encoded_s_url = storage.get_url_with_credentials(settings,
                                                     source_url_initial)
    logger.debug("encoded_s_url=%s" % encoded_s_url)

    dest_url = _get_dest_bdp_url(settings)
    computation_platform_url = settings['comp_platform_url']
    comp_pltf_settings = manage.get_platform_settings(
        computation_platform_url, bdp_username)
    settings.update(comp_pltf_settings)

    encoded_d_url = storage.get_url_with_credentials(
        settings, dest_url, is_relative_path=True,
        ip_address=settings['host'])

    storage.copy_directories(encoded_s_url, encoded_d_url)

    for content_fname, content in _instantiate_context(
            source_url_initial, settings, values_map).items():
        content_url = storage.get_url_with_credentials(
            settings, os.path.join(dest_url, content_fname),
            is_relative_path=True, ip_address=settings['host'])
        logger.debug("content_url=%s" % content_url)
        storage.put_file(content_url, content.encode('utf-8'))

    _save_values(settings, dest_url, values_map)

    logger.debug("done input upload")
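# Illustrative sketch only: the minimal keys that _upload_variations_inputs
# above reads from `settings` (key names come from the function bodies in
# this module; the values, URL and values map below are hypothetical).
example_settings = {
    'bdp_username': 'alice',
    'comp_platform_url': 'nectar',
    'host': '127.0.0.1',
    'payload_destination': 'remote_dir',   # used by _get_dest_bdp_url
    'contextid': 42,
}
# _upload_variations_inputs(example_settings,
#                           'ssh://nci@remotehost/input_0',
#                           {'run_counter': 1})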
def _instantiate_context(source_url, settings, context):
    templ_pat = re.compile("(.*)_template")
    encoded_s_url = storage.get_url_with_credentials(
        settings, source_url, is_relative_path=False)
    logger.debug("encoded_s_url=%s" % encoded_s_url)
    fnames = storage.list_dirs(encoded_s_url, list_files=True)
    logger.debug("fnames=%s" % fnames)
    new_content = {}
    for fname in fnames:
        logger.debug("fname=%s" % fname)
        templ_mat = templ_pat.match(fname)
        if templ_mat:
            base_fname = templ_mat.group(1)
            basename_url_with_pkey = storage.get_url_with_credentials(
                settings, os.path.join(source_url, fname),
                is_relative_path=False)
            logger.debug("basename_url_with_pkey=%s" % basename_url_with_pkey)
            cont = storage.get_file(basename_url_with_pkey)
            try:
                t = Template(cont)
            except TemplateSyntaxError as e:
                logger.error(e)
                # FIXME: should detect this during submission of the job,
                # as there is no sensible way to recover here.
                # TODO: signal error conditions in job status
                continue
            con = Context(context)
            logger.debug("context=%s" % context)
            new_content[base_fname] = t.render(con)
    return new_content
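# A standalone sketch of the rendering step performed by
# _instantiate_context above: a file named "<base>_template" is rendered
# through Django's template engine against the values map and stored
# under "<base>". Assumes a configured Django environment, as in the
# module this excerpt belongs to; the template body here is hypothetical.
from django.template import Context, Template

values_map = {'natoms': 512, 'temperature': 300}
source = "natoms {{ natoms }}\ntemperature {{ temperature }}\n"
rendered = Template(source).render(Context(values_map))
# rendered == "natoms 512\ntemperature 300\n"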
def copy_files_with_pattern(iter_out_fsys, source_path, dest_path, pattern,
                            all_settings):
    """Copy files matching the fnmatch pattern from source_path to dest_path."""
    output_prefix = '%s://%s@' % (all_settings['scheme'],
                                  all_settings['type'])

    logger.debug('source_path=%s, dest_path=%s' % (source_path, dest_path))
    # (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(source_path)
    _, node_output_fnames = iter_out_fsys.listdir(source_path)
    ip_address = all_settings['ip_address']
    for f in node_output_fnames:
        if fnmatch.fnmatch(f, pattern):
            source_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(ip_address, source_path, f),
                is_relative_path=False)
            dest_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(ip_address, dest_path, f),
                is_relative_path=False)
            logger.debug('source_url=%s, dest_url=%s' % (source_url, dest_url))
            content = storage.get_file(source_url)
            storage.put_file(dest_url, content)
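# fnmatch drives the file selection in copy_files_with_pattern above; a
# quick illustration of its shell-style (not regex) matching semantics,
# using hypothetical file names:
import fnmatch

fnames = ['grerr01.dat', 'grerr02.dat', 'HRMC.inp', 'output']
matched = [f for f in fnames if fnmatch.fnmatch(f, 'grerr*.dat')]
# matched == ['grerr01.dat', 'grerr02.dat']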
def copy_to_scratch_space(self, run_settings, local_settings):
    bdp_username = run_settings[
        'http://rmit.edu.au/schemas/bdp_userprofile']['username']
    output_storage_url = run_settings[
        'http://rmit.edu.au/schemas/platform/storage/output'][
        'platform_url']
    output_storage_settings = manage.get_platform_settings(
        output_storage_url, bdp_username)

    run_settings['http://rmit.edu.au/schemas/platform/storage/output'][
        'offset'] = self.output_loc_offset
    offset = run_settings[
        'http://rmit.edu.au/schemas/platform/storage/output']['offset']
    self.job_dir = manage.get_job_dir(output_storage_settings, offset)
    iter_inputdir = os.path.join(self.job_dir, "input_0")
    logger.debug("iter_inputdir=%s" % iter_inputdir)

    input_location = run_settings[
        RMIT_SCHEMA + '/input/system']['input_location']
    logger.debug("input_location=%s" % input_location)
    # TODO: input location will eventually be replaced by the scratch
    # space that was used by the sweep
    # TODO: the sweep will indicate the location of the scratch space
    # in the run_settings
    # TODO: add scheme (ssh) to input location
    source_url = get_url_with_credentials(local_settings, input_location)
    logger.debug("source_url=%s" % source_url)

    destination_url = get_url_with_credentials(
        output_storage_settings,
        '%s://%s@%s' % (output_storage_settings['scheme'],
                        output_storage_settings['type'],
                        iter_inputdir),
        is_relative_path=False)
    logger.debug("destination_url=%s" % destination_url)
    storage.copy_directories(source_url, destination_url)
def _copy_previous_inputs(self, local_settings, output_storage_settings,
                          computation_platform_settings):
    output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                  output_storage_settings['type'])
    for proc in self.ready_processes:
        source_location = os.path.join(self.job_dir, "input_backup",
                                       proc['id'])
        source_files_url = get_url_with_credentials(
            output_storage_settings, output_prefix + source_location,
            is_relative_path=False)

        relative_path_suffix = self.get_relative_output_path(local_settings)
        #dest_files_location = computation_platform_settings['type'] + "@"\
        #    + os.path.join(
        #        local_settings['payload_destination'],
        #        proc['id'], local_settings['payload_cloud_dirname'])
        dest_files_location = computation_platform_settings['type'] + "@"\
            + os.path.join(relative_path_suffix, proc['id'],
                           local_settings['payload_cloud_dirname'])
        logger.debug('dest_files_location=%s' % dest_files_location)

        dest_files_url = get_url_with_credentials(
            computation_platform_settings, dest_files_location,
            is_relative_path=True, ip_address=proc['ip_address'])
        logger.debug('dest_files_url=%s' % dest_files_url)
        storage.copy_directories(source_files_url, dest_files_url)
def _get_output(self, local_settings, source_url):
    """
        Retrieve the output from the task on the node
    """
    logger.debug("get_output from %s" % source_url)

    computation_platform_url = local_settings['comp_platform_url']
    bdp_username = local_settings['bdp_username']
    comp_pltf_settings = manage.get_platform_settings(
        computation_platform_url, bdp_username)
    local_settings.update(comp_pltf_settings)

    encoded_s_url = storage.get_url_with_credentials(
        local_settings, source_url, is_relative_path=True,
        ip_address=local_settings['host'])

    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(encoded_s_url)
    make_path = os.path.join(query_settings['root_path'], mypath)
    logger.debug("make_path=%s" % make_path)

    output_storage_url = local_settings['storeout_platform_url']
    logger.debug("output_storage_url=%s" % output_storage_url)
    output_storage_settings = manage.get_platform_settings(
        output_storage_url, bdp_username)
    local_settings.update(output_storage_settings)
    logger.debug("output_storage_settings=%s" % output_storage_settings)

    dest_url = '%s://%s@%s/%s/make%s' % (
        output_storage_settings['scheme'],
        output_storage_settings['type'],
        output_storage_settings['host'],
        local_settings['storeout_platform_offset'],
        str(local_settings['contextid']))

    logger.debug("Transferring output from %s to %s" % (source_url,
                                                        dest_url))
    encoded_d_url = storage.get_url_with_credentials(local_settings,
                                                     dest_url)
    logger.debug("encoded_d_url=%s" % encoded_d_url)
    # FIXME: might want to turn on paramiko compress function
    #storage_files(encoded_d_url, exceptions=[])
    # to speed up this transfer
    try:
        storage.copy_directories(encoded_s_url, encoded_d_url)
    except SSHException as e:
        logger.error(e)
        # FIXME: Could just exit, but need to flag that this data has not
        # been transferred.
        raise
def start_multi_bootstrap_task(settings, relative_path_suffix):
    """
    Run the package on each of the nodes in the group and grab
    any output as needed
    """
    nodes = get_registered_vms(settings)
    logger.debug("nodes=%s" % nodes)
    requested_nodes = 0
    maketarget_nodegroup_pair = {}

    # TODO: need testcases for following code
    if not maketarget_nodegroup_pair:
        EMPTY_MAKE_TARGET = ''
        requested_nodes = len(nodes)
        maketarget_nodegroup_pair[EMPTY_MAKE_TARGET] = requested_nodes
    else:
        for i in maketarget_nodegroup_pair.keys():
            requested_nodes += maketarget_nodegroup_pair[i]
        if requested_nodes > len(nodes):
            message = "Requested nodes %d; but available nodes %s " \
                % (requested_nodes, len(nodes))
            logger.exception(message)
            raise InsufficientResourceError(message)
    logger.info("Requested nodes %d: \nAvailable nodes %s "
                % (requested_nodes, len(nodes)))

    logger.debug('starting setup')
    for make_target in maketarget_nodegroup_pair:
        for i in range(0, maketarget_nodegroup_pair[make_target]):
            instance = nodes[0]
            node_ip = instance.ip_address
            if not node_ip:
                node_ip = instance.private_ip_address
            logger.debug("node_ip=%s" % node_ip)
            logger.debug('constructing source')
            source = get_url_with_credentials(settings,
                                              settings['payload_source'])
            logger.debug('source=%s' % source)
            #relative_path = '%s@%s' % (settings['type'], settings['payload_destination'])
            relative_path = '%s@%s' % (settings['type'],
                                       relative_path_suffix)
            destination = get_url_with_credentials(
                settings, relative_path,
                is_relative_path=True, ip_address=node_ip)
            logger.debug("Source %s" % source)
            logger.debug("Destination %s" % destination)
            logger.debug("Relative path %s" % relative_path)
            _start_bootstrap(instance, node_ip, settings, source,
                             destination)
            nodes.pop(0)
def run_task(self, ip_address, process_id, settings, run_settings):
    """
        Start the task on the instance, then hang and
        periodically check its state.
    """
    logger.debug("run_task %s" % ip_address)
    #ip = botocloudconnector.get_instance_ip(instance_id, settings)
    #ip = ip_address
    logger.debug("ip=%s" % ip_address)
    # curr_username = settings['username']
    #settings['username'] = '******'
    # ssh = sshconnector.open_connection(ip_address=ip,
    #                                    settings=settings)
    # settings['username'] = curr_username
    #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
    relative_path_suffix = self.get_relative_output_path(settings)
    relative_path = settings['type'] + '@' + os.path.join(
        relative_path_suffix, process_id)
    destination = get_url_with_credentials(settings,
                                           relative_path,
                                           is_relative_path=True,
                                           ip_address=ip_address)
    makefile_path = get_make_path(destination)
    # open the connection before the try block so ssh is always bound
    # when the finally clause runs
    ssh = open_connection(ip_address=ip_address, settings=settings)
    try:
        command, errs = run_make(ssh, makefile_path,
                                 'start_running_process')
        logger.debug('execute_command=%s' % command)
    finally:
        ssh.close()
def prepare_inputs(self, local_settings, output_storage_settings,
                   computation_platform_settings, mytardis_settings,
                   run_settings):
    """
        Upload all input files for this run
    """
    logger.debug("preparing inputs")
    # TODO: to ensure reproducibility, may want to precalculate all random
    # numbers and store rather than rely on canonical execution of the
    # rest of this function.
    #processes = self.schedule_procs
    processes = [x for x in self.schedule_procs
                 if x['status'] == 'ready']
    self.node_ind = 0
    logger.debug("Iteration Input dir %s" % self.iter_inputdir)
    output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                  output_storage_settings['type'])
    url_with_pkey = get_url_with_credentials(
        output_storage_settings, output_prefix + self.iter_inputdir,
        is_relative_path=False)
    logger.debug("url_with_pkey=%s" % url_with_pkey)
    input_dirs = list_dirs(url_with_pkey)
    if not input_dirs:
        raise BadInputException(
            "require an initial subdirectory of input directory")
    for input_dir in sorted(input_dirs):
        logger.debug("Input dir %s" % input_dir)
        self.upload_variation_inputs(
            run_settings, local_settings,
            self.generate_variations(input_dir, local_settings,
                                     output_storage_settings,
                                     run_settings),
            processes, input_dir, output_storage_settings,
            computation_platform_settings, mytardis_settings)
def start_round_robin_reschedule(nodes, procs_2b_rescheduled,
                                 current_procs, settings,
                                 output_storage_settings,
                                 relative_path_suffix):
    total_nodes = len(nodes)
    all_nodes = list(nodes)
    processes = len(procs_2b_rescheduled)
    if total_nodes > processes:
        total_nodes = processes
        all_nodes = nodes[:total_nodes]
    if total_nodes == 0:
        return
    proc_per_node = processes / total_nodes
    remaining_procs = processes % total_nodes
    index = 0
    new_processes = current_procs
    rescheduled_procs = list(procs_2b_rescheduled)
    for cur_node in all_nodes:
        logger.debug('Schedule here %s' % cur_node)
        ip_address = cur_node.ip_address
        if not ip_address:
            ip_address = cur_node.private_ip_address
        logger.debug('ip_address=%s' % ip_address)
        #relative_path = output_storage_settings['type'] + '@' + settings['payload_destination']
        relative_path = output_storage_settings['type'] + '@' \
            + relative_path_suffix
        procs_on_cur_node = proc_per_node
        if remaining_procs:
            procs_on_cur_node = proc_per_node + 1
            remaining_procs -= 1
        logger.debug('procs_cur_node=%d' % procs_on_cur_node)
        ids = get_procs_ids(procs_on_cur_node,
                            rescheduled_procs=rescheduled_procs)
        #index += len(ids)
        #logger.debug('index=%d' % index)
        put_proc_ids(relative_path, ids, ip_address, settings)
        new_processes = construct_lookup_table(
            ids, ip_address, new_processes,
            status='reschedule_ready',
            maximum_retry=int(settings['maximum_retry']))

        destination = get_url_with_credentials(
            settings, relative_path,
            is_relative_path=True, ip_address=ip_address)
        logger.debug('schedule destination=%s' % destination)
        makefile_path = get_make_path(destination)
        logger.debug('makefile_path=%s' % makefile_path)
        command = "cd %s; make %s" % (
            makefile_path,
            'start_schedule PAYLOAD_NAME=%s IDS=%s'
            % (settings['payload_name'], settings['filename_for_PIDs']))
        command_out = ''
        errs = ''
        logger.debug("starting command for %s" % ip_address)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command_out, errs = run_command_with_status(ssh, command)
        except Exception as e:
            logger.error(e)
        finally:
            if ssh:
                ssh.close()
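# The distribution arithmetic used by the round-robin schedulers in this
# module, shown standalone: with integer division (this code base runs on
# Python 2, where / on ints truncates), the first `processes % nodes`
# nodes each receive one extra process. E.g. 8 processes over 3 nodes:
processes, total_nodes = 8, 3
proc_per_node = processes // total_nodes   # 2
remaining_procs = processes % total_nodes  # 2
allocation = []
for _ in range(total_nodes):
    extra = 1 if remaining_procs else 0
    remaining_procs -= extra
    allocation.append(proc_per_node + extra)
# allocation == [3, 3, 2]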
def get_total_templates(self, maps, **kwargs):
    run_settings = kwargs['run_settings']
    output_storage_settings = kwargs['output_storage_settings']
    job_dir = kwargs['job_dir']
    try:
        id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
    except (SettingNotFoundException, ValueError) as e:
        logger.debug(e)
        id = 0
    iter_inputdir = os.path.join(job_dir, "input_%s" % id)
    url_with_pkey = get_url_with_credentials(
        output_storage_settings,
        '%s://%s@%s' % (output_storage_settings['scheme'],
                        output_storage_settings['type'],
                        iter_inputdir),
        is_relative_path=False)
    logger.debug(url_with_pkey)
    input_dirs = list_dirs(url_with_pkey)
    for _, template_map in enumerate(maps):
        logger.debug("template_map=%s" % template_map)
        map_keys = template_map.keys()
        logger.debug("map_keys %s" % map_keys)
        map_ranges = [list(template_map[x]) for x in map_keys]
        product = 1
        for i in map_ranges:
            product = product * len(i)
        total_templates = product * len(input_dirs)
        logger.debug("total_templates=%d" % (total_templates))
    return total_templates
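# A worked example of the counting in get_total_templates above: the
# number of templates is the product of the lengths of the mapped value
# ranges, multiplied by the number of input directories (map contents
# here are hypothetical).
template_map = {'a': [1, 2, 3], 'b': [0.1, 0.2]}
product = 1
for rng in [list(template_map[k]) for k in template_map.keys()]:
    product = product * len(rng)   # 3 * 2 == 6
total_templates = product * 4      # with, say, 4 input dirs -> 24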
def run_task(self, ip_address, process_id, settings, run_settings):
    """
        Start the task on the instance, then hang and
        periodically check its state.
    """
    logger.debug("run_task %s" % ip_address)
    #ip = botocloudconnector.get_instance_ip(instance_id, settings)
    #ip = ip_address
    logger.debug("ip=%s" % ip_address)
    # curr_username = settings['username']
    #settings['username'] = '******'
    # ssh = sshconnector.open_connection(ip_address=ip,
    #                                    settings=settings)
    # settings['username'] = curr_username
    #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
    relative_path_suffix = self.get_relative_output_path(settings)
    relative_path = settings['type'] + '@' + \
        os.path.join(relative_path_suffix, process_id)
    destination = get_url_with_credentials(settings,
                                           relative_path,
                                           is_relative_path=True,
                                           ip_address=ip_address)
    makefile_path = get_make_path(destination)
    # open the connection before the try block so ssh is always bound
    # when the finally clause runs
    ssh = open_connection(ip_address=ip_address, settings=settings)
    try:
        logger.debug(settings['process_output_dirname'])
        try:
            self.hadoop_input = 'HADOOP_INPUT_%s' % self.contextid
            self.hadoop_output = 'HADOOP_OUTPUT_%s' % self.contextid
            hadoop = run_settings['%s/input/system/compplatform/hadoop'
                                  % django_settings.SCHEMA_PREFIX]
            sudo = False
            options = '%s %s %s %s %s ' % (
                settings['smart_connector_input'],
                settings['process_output_dirname'],
                settings['hadoop_home_path'],
                self.hadoop_input, self.hadoop_output)
            logger.debug('options = %s ' % options)
            optional_args = self.get_optional_args(run_settings)
            if optional_args:
                options += " %s" % optional_args
            logger.debug('options = %s ' % options)
            command, errs = run_make(
                ssh, makefile_path,
                'start_running_process %s' % options, sudo=sudo)
        except KeyError:
            sudo = True
            command, errs = run_make(
                ssh, makefile_path,
                'start_running_process %s %s'
                % (settings['smart_connector_input'],
                   settings['process_output_dirname']),
                sudo=sudo)
        logger.debug('execute_command=%s' % command)
    finally:
        ssh.close()
def copy_to_scratch_space(self, run_settings, local_settings,
                          result_offset):
    bdp_username = run_settings['%s/bdp_userprofile'
                                % django_settings.SCHEMA_PREFIX]['username']
    output_storage_url = run_settings[
        '%s/platform/storage/output'
        % django_settings.SCHEMA_PREFIX]['platform_url']
    output_storage_settings = manage.get_platform_settings(
        output_storage_url, bdp_username)

    run_settings['%s/platform/storage/output'
                 % django_settings.SCHEMA_PREFIX][
        'offset'] = self.output_loc_offset
    offset = run_settings['%s/platform/storage/output'
                          % django_settings.SCHEMA_PREFIX]['offset']
    self.job_dir = manage.get_job_dir(output_storage_settings, offset)
    iter_inputdir = os.path.join(self.job_dir, result_offset)
    logger.debug("iter_inputdir=%s" % iter_inputdir)

    input_storage_settings = self.get_platform_settings(
        run_settings,
        '%s/platform/storage/input' % django_settings.SCHEMA_PREFIX)

    #input_location = run_settings[django_settings.SCHEMA_PREFIX + '/input/system']['input_location']
    try:
        input_location = getval(
            run_settings,
            django_settings.SCHEMA_PREFIX + '/input/system/input_location')
    except SettingNotFoundException:
        try:
            input_location = getval(
                run_settings,
                django_settings.SCHEMA_PREFIX
                + '/input/location/input_location')
        except SettingNotFoundException:
            input_location = getval(
                run_settings,
                django_settings.SCHEMA_PREFIX
                + '/input/location/input/input_location')
    logger.debug("input_location=%s" % input_location)
    # TODO: input location will eventually be replaced by the scratch
    # space that was used by the sweep
    # TODO: the sweep will indicate the location of the scratch space
    # in the run_settings
    # TODO: add scheme (ssh) to input location
    #source_url = get_url_with_credentials(local_settings, input_location)

    input_offset = run_settings['%s/platform/storage/input'
                                % django_settings.SCHEMA_PREFIX]['offset']
    input_url = "%s://%s@%s/%s" % (input_storage_settings['scheme'],
                                   input_storage_settings['type'],
                                   input_storage_settings['host'],
                                   input_offset)
    source_url = get_url_with_credentials(
        input_storage_settings, input_url, is_relative_path=False)
    logger.debug("source_url=%s" % source_url)

    destination_url = get_url_with_credentials(
        output_storage_settings,
        '%s://%s@%s' % (output_storage_settings['scheme'],
                        output_storage_settings['type'],
                        iter_inputdir),
        is_relative_path=False)
    logger.debug("destination_url=%s" % destination_url)
    storage.copy_directories(source_url, destination_url)
def get_output(self, ip_address, process_id, output_dir, local_settings,
               computation_platform_settings, output_storage_settings,
               run_settings):
    """
        Retrieve the output from the task on the node
    """
    logger.debug("get_output of process %s on %s" % (process_id,
                                                     ip_address))
    output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                  output_storage_settings['type'])
    #fixme: add call get_process_output_path
    #cloud_path = os.path.join(local_settings['payload_destination'],
    #                          #str(contextid), #fixme: uncomment
    #                          str(process_id),
    #                          local_settings['payload_cloud_dirname']
    #                          )
    relative_path_suffix = self.get_relative_output_path(local_settings)
    cloud_path = os.path.join(relative_path_suffix, str(process_id),
                              local_settings['payload_cloud_dirname'])
    #cloud_path = self.get_process_output_path(run_settings, process_id)
    logger.debug("cloud_path=%s" % cloud_path)
    logger.debug("Transferring output from %s to %s" % (cloud_path,
                                                        output_dir))
    ip = ip_address  # botocloudconnector.get_instance_ip(instance_id, settings)
    #ssh = open_connection(ip_address=ip, settings=settings)
    source_files_location = "%s://%s@%s" % (
        computation_platform_settings['scheme'],
        computation_platform_settings['type'],
        os.path.join(ip, cloud_path))
    source_files_url = get_url_with_credentials(
        computation_platform_settings, source_files_location,
        is_relative_path=False)
    logger.debug('source_files_url=%s' % source_files_url)

    dest_files_url = get_url_with_credentials(
        output_storage_settings,
        output_prefix + os.path.join(self.job_dir, self.output_dir,
                                     process_id),
        is_relative_path=False)
    logger.debug('dest_files_url=%s' % dest_files_url)
    # FIXME: might want to turn on paramiko compress function
    # to speed up this transfer
    storage.copy_directories(source_files_url, dest_files_url)
def process(self, run_settings):
    self.experiment_id = 0
    local_settings = setup_settings(run_settings)
    self.experiment_id = local_settings['experiment_id']
    messages.info(run_settings, "1: waiting for completion")
    logger.debug("settings=%s" % local_settings)

    try:
        self.runs_left = ast.literal_eval(
            getval(run_settings, '%s/stages/make/runs_left' % RMIT_SCHEMA))
    except (ValueError, SettingNotFoundException):
        self.runs_left = []

    # if self._exists(run_settings,
    #                 'http://rmit.edu.au/schemas/stages/make',
    #                 u'runs_left'):
    #     self.runs_left = ast.literal_eval(
    #         run_settings['http://rmit.edu.au/schemas/stages/make'][u'runs_left'])
    # else:
    #     self.runs_left = []

    def _get_dest_bdp_url(local_settings):
        return "%s@%s" % (
            "nci",
            os.path.join(local_settings['payload_destination'],
                         str(local_settings['contextid'])))

    dest_url = _get_dest_bdp_url(local_settings)
    computation_platform_url = local_settings['comp_platform_url']
    bdp_username = local_settings['bdp_username']
    comp_pltf_settings = manage.get_platform_settings(
        computation_platform_url, bdp_username)
    local_settings.update(comp_pltf_settings)

    encoded_d_url = storage.get_url_with_credentials(
        local_settings, dest_url, is_relative_path=True,
        ip_address=local_settings['host'])
    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(encoded_d_url)

    if self.runs_left:
        job_finished = self._job_finished(settings=local_settings,
                                          remote_path=dest_url)
        if not job_finished:
            return
        self._get_output(local_settings, dest_url)
        self.runs_left -= 1

    if self.runs_left <= 0:
        messages.success(run_settings, "%s: finished" % (1))

    logger.debug("processing finished")
def complete_bootstrap(bootstrap_class, local_settings):
    try:
        nodes = ast.literal_eval(local_settings['created_nodes'])
        logger.debug("nodes=%s" % nodes)
        running_created_nodes = [x for x in bootstrap_class.created_nodes
                                 if str(x[3]) == 'running']
        if len(nodes) < len(running_created_nodes):
            raise VMTerminatedError
    except NoRegisteredVMError as e:
        logger.debug('NoRegisteredVMError detected')
        ftmanager = FTManager()
        ftmanager.manage_failure(e, stage_class=bootstrap_class,
                                 settings=local_settings)
    except VMTerminatedError as e:
        logger.debug('VMTerminatedError detected')
        ftmanager = FTManager()
        ftmanager.manage_failure(e, stage_class=bootstrap_class,
                                 settings=local_settings)
    for node in nodes:
        node_ip = node[1]
        if node_ip in [x[1] for x in bootstrap_class.bootstrapped_nodes]:
            continue
        relative_path_suffix = bootstrap_class.get_relative_output_path(
            local_settings)
        relative_path = "%s@%s" % (local_settings['type'],
                                   relative_path_suffix)
        destination = get_url_with_credentials(local_settings,
                                               relative_path,
                                               is_relative_path=True,
                                               ip_address=node_ip)
        logger.debug("Relative path %s" % relative_path)
        logger.debug("Destination %s" % destination)
        try:
            fin = _is_bootstrap_complete(node_ip, local_settings,
                                         destination)
        except IOError as e:
            logger.error(e)
            fin = False
        except Exception as e:
            logger.error(e)
            fin = False
            ftmanager = FTManager()
            ftmanager.manage_failure(e, stage_class=bootstrap_class,
                                     vm_ip=node_ip, vm_id=node[0],
                                     settings=local_settings)
def start_round_robin_schedule(nodes, processes, schedule_index, settings,
                               relative_path_suffix):
    total_nodes = len(nodes)
    all_nodes = list(nodes)
    if total_nodes > processes:
        total_nodes = processes
        all_nodes = nodes[:total_nodes]
    if total_nodes == 0:
        return
    proc_per_node = processes / total_nodes
    remaining_procs = processes % total_nodes
    index = schedule_index
    new_processes = []

    for cur_node in all_nodes:
        ip_address = cur_node[1]
        #relative_path = settings['type'] + '@' + settings['payload_destination']
        relative_path = settings['type'] + '@' + relative_path_suffix
        procs_on_cur_node = proc_per_node
        if remaining_procs:
            procs_on_cur_node = proc_per_node + 1
            remaining_procs -= 1
        logger.debug('procs_cur_node=%d' % procs_on_cur_node)
        ids = get_procs_ids(procs_on_cur_node, index=index)
        index += len(ids)
        logger.debug('index=%d' % index)
        put_proc_ids(relative_path, ids, ip_address, settings)
        new_processes = construct_lookup_table(
            ids, ip_address, new_processes,
            maximum_retry=int(settings['maximum_retry']))

        destination = get_url_with_credentials(
            settings, relative_path,
            is_relative_path=True, ip_address=ip_address)
        logger.debug('schedule destination=%s' % destination)
        makefile_path = get_make_path(destination)
        logger.debug('makefile_path=%s' % makefile_path)
        command = "cd %s; make %s" % (
            makefile_path,
            'start_schedule %s %s %s %s'
            % (settings['payload_name'], settings['filename_for_PIDs'],
               settings['process_output_dirname'],
               settings['smart_connector_input']))
        command_out = ''
        errs = ''
        logger.debug("starting command for %s" % ip_address)
        ssh = None
        try:
            ssh = open_connection(ip_address=ip_address, settings=settings)
            command_out, errs = run_command_with_status(ssh, command)
        except Exception as e:
            logger.error(e)
        finally:
            if ssh:
                ssh.close()
def _load_values_map(settings, url):
    values = {}
    try:
        enc_url = storage.get_url_with_credentials(
            settings, "%s/%s" % (url, VALUES_FNAME))
        logger.debug("values_file=%s" % enc_url)
        values_content = storage.get_file(enc_url)
    except IOError:
        logger.warn("no values file found")
    else:
        logger.debug("values_content = %s" % values_content)
        values = dict(json.loads(values_content))
    return values
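# Sketch of the round trip _load_values_map performs: the values file
# (named by VALUES_FNAME, defined elsewhere in this module) holds a flat
# JSON map. The content below is hypothetical.
import json

values_content = '{"run_counter": 1, "natoms": 512}'
values = dict(json.loads(values_content))
# values == {'run_counter': 1, 'natoms': 512}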
def put_proc_ids(relative_path, ids, ip, settings):
    relative_path = os.path.join(relative_path,
                                 settings['filename_for_PIDs'])
    logger.debug('put_proc_ids=%s' % relative_path)
    destination = get_url_with_credentials(settings,
                                           relative_path,
                                           is_relative_path=True,
                                           ip_address=ip)
    logger.debug('destination=%s' % destination)
    ids_str = [str(i) for i in ids]
    proc_ids = ("\n".join(ids_str)) + "\n"
    logger.debug('ids_str=%s' % ids_str)
    logger.debug('proc_ids=%s' % proc_ids)
    logger.debug('encoded=%s' % proc_ids.encode('utf-8'))
    put_file(destination, proc_ids.encode('utf-8'))
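# The PID file format written by put_proc_ids above: one process id per
# line with a trailing newline (ids below are hypothetical).
ids = [4, 5, 6]
proc_ids = "\n".join(str(i) for i in ids) + "\n"
# proc_ids == "4\n5\n6\n"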
def get_dataset_name_for_output(settings, url, path):
    logger.debug("path=%s" % path)
    host = settings['host']
    prefix = 'ssh://%s@%s' % (settings['type'], host)
    source_url = get_url_with_credentials(
        settings, os.path.join(prefix, path, "HRMC.inp_values"),
        is_relative_path=False)
    logger.debug("source_url=%s" % source_url)
    try:
        content = storage.get_file(source_url)
    except IOError as e:
        logger.warn("cannot read file %s" % e)
        return str(os.sep.join(
            path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
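# The fallback dataset name above is just the last EXP_DATASET_NAME_SPLIT
# components of the path; e.g. assuming a split constant of 2 (the actual
# value is defined elsewhere in this module) and a hypothetical path:
import os

EXP_DATASET_NAME_SPLIT = 2
path = 'myjob/output_3/node_1'
name = str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
# name == 'output_3/node_1' on POSIX systems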
def put_dest_file(proc, fname, dest_file_location,
                  resched_file_location, content):
    dest_url = get_url_with_credentials(
        computation_platform_settings,
        os.path.join(dest_file_location, fname),
        is_relative_path=True, ip_address=proc['ip_address'])
    logger.debug("writing to =%s" % dest_url)
    #logger.debug("content=%s" % content)
    storage.put_file(dest_url, content)
    if self.reschedule_failed_procs:
        logger.debug("resched=%s" % resched_file_location)
        logger.debug("fname=%s" % fname)
        logger.debug("output_storage_settings=%s"
                     % output_storage_settings)
        logger.debug("here")
        test = "%s/%s" % (resched_file_location, fname)
        logger.debug("test=%s" % test)
        resched_url = get_url_with_credentials(
            output_storage_settings, test)
        logger.debug("writing backup to %s" % resched_url)
        storage.put_file(resched_url, content)
    logger.debug("done")
def _get_dataset_name_for_input(settings, url, path):
    logger.debug("path=%s" % path)
    source_url = get_url_with_credentials(
        output_storage_settings,
        output_prefix + os.path.join(output_host, path,
                                     self.VALUES_FNAME),
        is_relative_path=False)
    logger.debug("source_url=%s" % source_url)
    try:
        content = get_file(source_url)
    except IOError:
        return str(os.sep.join(
            path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
    logger.debug("content=%s" % content)
    try:
        values_map = dict(json.loads(str(content)))
    except Exception as e:
        logger.warn("cannot load %s: %s" % (content, e))
        return str(os.sep.join(
            path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
def generate_rands(settings, start_range, end_range, num_required,
                   start_index):
    # FIXME: there must be a third-party library that does this more
    # effectively.
    rand_nums = []
    num_url = get_url_with_credentials(settings,
                                       settings['random_numbers'],
                                       is_relative_path=False)
    random_content = get_file(num_url)
    # FIXME: this loads the entire file, which could be very large.
    numbers = random_content.split('\n')
    random_counter = start_index
    # FIXME: better handled with separate function
    if end_range < start_range:
        # special case, where we want rands in range of number of rands in file
        start_range = 0
        end_range = len(numbers)
    for i in range(0, num_required):
        raw_num = float(numbers[random_counter])
        num = int((raw_num * float(end_range - start_range)) + start_range)
        rand_nums.append(num)
        logger.debug("[0,1) %s -> [%s,%s) %s" % (raw_num, start_range,
                                                 end_range, num))
        random_counter += 1
        if random_counter >= len(numbers):
            random_counter = 0

    # for i, line in enumerate(random_content.split('\n')):
    #     if start_index <= i < (start_index + num_required):
    #         raw_num = float(line)
    #         num = int((raw_num * float(end_range - start_range)) + start_range)
    #         logger.debug("[0,1) %s -> [%s,%s) %s" % (raw_num, start_range, end_range, num))
    #         rand_nums.append(num)

    logger.debug("Generated %s random numbers from %s in range [%s, %s): %s "
                 % (num_required, num_url, start_range, end_range,
                    pformat(rand_nums)))
    return rand_nums
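# The scaling step in generate_rands above maps a uniform float in [0, 1)
# read from the random-numbers file onto the integer range
# [start_range, end_range); raw_num here is a hypothetical file value:
start_range, end_range = 10, 20
raw_num = 0.37
num = int((raw_num * float(end_range - start_range)) + start_range)
# num == 13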
def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
                   all_settings):
    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(output_url)
    logger.debug("output_url=%s" % output_url)
    output_settings = self.get_platform_settings(
        run_settings, RMIT_SCHEMA + '/platform/storage/output')
    current_output_url = "%s://%s@%s/%s" % (
        scheme, output_settings['type'], host,
        os.path.join(mypath, '1/'))
    logger.debug('current-dest=%s' % current_output_url)
    outcar_url = storage.get_url_with_credentials(
        output_settings, current_output_url + self.OUTCAR_FILE,
        is_relative_path=False)
    logger.debug("outcar_url=%s" % outcar_url)

    try:
        outcar_content = storage.get_file(outcar_url)
    except IOError as e:
        logger.error(e)
        toten = None
def _job_finished(self, settings, remote_path):
    encoded_d_url = storage.get_url_with_credentials(
        settings=settings,
        url_or_relative_path=remote_path,
        is_relative_path=True,
        ip_address=settings['host'])
    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(encoded_d_url)
    stdout = ''
    stderr = ''
    try:
        ssh = open_connection(ip_address=host, settings=settings)
        (stdout, stderr) = compute.run_make(
            ssh, os.path.join(query_settings['root_path'], mypath),
            'running')
    except Exception as e:
        logger.error(e)
        raise
def compute_hrmc_criterion(self, number, node_output_dir, fs,
                           output_storage_settings):
    output_prefix = '%s://%s@' % (output_storage_settings['scheme'],
                                  output_storage_settings['type'])
    grerr_file = 'grerr%s.dat' % str(number).zfill(2)
    logger.debug("grerr_file=%s " % grerr_file)
    grerr_url = get_url_with_credentials(
        output_storage_settings,
        output_prefix + os.path.join(self.output_dir, node_output_dir,
                                     grerr_file),
        is_relative_path=False)
    grerr_content = storage.get_file(grerr_url)
    # FIXME: check that get_file can raise IOError
    if not grerr_content:
        logger.warn("no grerr content found")
    logger.debug("grerr_content=%s" % grerr_content)
    try:
        criterion = float(grerr_content.strip().split('\n')[-1].split()[1])
    except ValueError as e:
        logger.warn("invalid criterion found in grerr "
                    + "file for %s/%s: %s" % (self.output_dir,
                                              node_output_dir, e))
        # re-raise so the caller sees the parse failure rather than an
        # unbound criterion
        raise
    logger.debug("criterion=%s" % criterion)
    return criterion
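# Sketch of the criterion extraction in compute_hrmc_criterion above: the
# criterion is the second whitespace-separated column of the last line of
# the grerr file (file body below is hypothetical).
grerr_content = "1 0.50\n2 0.25\n3 0.125\n"
criterion = float(grerr_content.strip().split('\n')[-1].split()[1])
# criterion == 0.125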
def process(self, run_settings):
    settings = setup_settings(run_settings)
    messages.info(run_settings, "1: execute starting")

    def _get_dest_bdp_url(settings):
        return "%s@%s" % (
            "nci",
            os.path.join(settings['payload_destination'],
                         str(settings['contextid'])))

    dest_url = _get_dest_bdp_url(settings)
    computation_platform_url = settings['comp_platform_url']
    bdp_username = settings['bdp_username']
    comp_pltf_settings = manage.get_platform_settings(
        computation_platform_url, bdp_username)
    logger.debug("comp_pltf_settings=%s" % pformat(comp_pltf_settings))
    settings.update(comp_pltf_settings)

    encoded_d_url = storage.get_url_with_credentials(
        settings, dest_url, is_relative_path=True,
        ip_address=settings['host'])
    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(encoded_d_url)

    stderr = ''
    try:
        ssh = open_connection(ip_address=settings['host'],
                              settings=settings)
        (command_out, stderr) = compute.run_make(
            ssh, os.path.join(query_settings['root_path'], mypath),
            'startrun')
    except Exception as e:
        logger.error(e)
        raise
mytardis_settings = _get_mytardis_settings(local_settings, bdp_username)
logger.debug(mytardis_settings)

if local_settings['curate_data']:
    if mytardis_settings['mytardis_host']:
        if directive == "vasp":
            # TODO: this is very domain specific
            OUTCAR_FILE = "OUTCAR"
            VALUES_FILE = "values"

            outcar_url = storage.get_url_with_credentials(
                local_settings,
                os.path.join(dest_url, OUTCAR_FILE),
                is_relative_path=False)
            logger.debug("outcar_url=%s" % outcar_url)

            try:
                outcar_content = storage.get_file(outcar_url)
            except IOError as e:
                logger.error(e)
                toten = None
            else:
                toten = None
                for line in outcar_content.split('\n'):
                    #logger.debug("line=%s" % line)
                    if 'e en' in line:
                        logger.debug("found")
                        try:
def is_job_finished(self, wait_class, ip_address, process_id, retry_left,
                    settings, relative_path_suffix):
    """
        Return True if the package job on instance_id has finished
    """
    # TODO: maybe this should be a reusable library method?
    ip = ip_address
    logger.debug("ip=%s" % ip)
    curr_username = settings['username']
    # settings['username'] = '******'
    #relative_path = settings['type'] + '@' + settings['payload_destination'] + "/" + process_id
    relative_path = settings['type'] + '@' + os.path.join(
        relative_path_suffix, process_id)
    destination = get_url_with_credentials(settings,
                                           relative_path,
                                           is_relative_path=True,
                                           ip_address=ip)
    makefile_path = get_make_path(destination)
    ssh = None
    try:
        logger.debug('trying ssh')
        ssh = open_connection(ip_address=ip, settings=settings)
        logger.debug('successful ssh')
        (command_out, errs) = run_make(ssh, makefile_path,
                                       "process_running_done")
        ssh.close()
        logger.debug("command_out2=(%s, %s)" % (command_out, errs))
        if command_out:
            logger.debug("command_out = %s" % command_out)
            for line in command_out:
                if "stopped" in line:
                    return True
    except Exception as e:
        # Failure detection and then management
        logger.debug('error is = %s' % e)
        process_failed = False
        node_failed = False
        logger.debug('Is there error? %s'
                     % wait_class.failure_detector.failed_ssh_connection(e))
        if wait_class.failure_detector.failed_ssh_connection(e):
            node = [x for x in wait_class.created_nodes
                    if x[1] == ip_address]
            wait_class.failed_processes = \
                wait_class.ftmanager.manage_failed_process(
                    settings, process_id, node[0], node[0][0], ip_address,
                    wait_class.failed_nodes, wait_class.executed_procs,
                    wait_class.current_processes, wait_class.all_processes,
                    wait_class.procs_2b_rescheduled)
            #wait_class.procs_2b_rescheduled.extend(rescheduled_prcs)
            '''
            if wait_class.failure_detector.node_terminated(settings, node[0][0]):
                if not wait_class.failure_detector.recorded_failed_node(
                        wait_class.failed_nodes, ip_address):
                    wait_class.failed_nodes.append(node[0])
                node_failed = True
            else:
                if not retry_left:
                    process_failed = True
                else:
                    process_lists = [wait_class.executed_procs,
                                     wait_class.current_processes,
                                     wait_class.all_processes]
                    wait_class.ftmanager.decrease_max_retry(
                        process_lists, ip_address, process_id)
            # Failure management
            if node_failed or process_failed:
                process_lists = [wait_class.executed_procs,
                                 wait_class.current_processes,
                                 wait_class.all_processes]
                if node_failed:
                    wait_class.ftmanager.flag_all_processes(process_lists,
                                                            ip_address)
                elif process_failed:
                    wait_class.ftmanager.flag_this_process(
                        process_lists, ip_address, process_id)
                wait_class.failed_processes = wait_class.ftmanager.\
                    get_total_failed_processes(wait_class.executed_procs)
                if wait_class.reschedule_failed_procs:
                    wait_class.ftmanager.collect_failed_processes(
                        wait_class.executed_procs,
                        wait_class.procs_2b_rescheduled)
            '''
        else:
            raise
def curate_dataset(self, run_settings, experiment_id, base_url, output_url,
                   all_settings):
    '''
        Curates dataset
    '''
    # Retrieve process directories below the current output location
    iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
    output_prefix = '%s://%s@' % (all_settings['scheme'],
                                  all_settings['type'])
    current_output_url = "%s%s" % (
        output_prefix,
        os.path.join(base_url, "output_%s" % iteration))
    (scheme, host, current_output_path, location, query_settings) = \
        storage.parse_bdpurl(output_url)
    output_fsys = storage.get_filesystem(output_url)
    process_output_dirs, _ = output_fsys.listdir(current_output_path)

    # Curate a dataset with metadata per process
    for i, process_output_dir in enumerate(process_output_dirs):
        # Expand the process output directory and add credentials for access
        process_output_url = '/'.join(
            [current_output_url, process_output_dir])
        process_output_url_with_cred = get_url_with_credentials(
            all_settings, process_output_url, is_relative_path=False)
        # Expand the process output file and add credentials for access
        output_file_url_with_cred = storage.get_url_with_credentials(
            all_settings, '/'.join([process_output_url, OUTPUT_FILE]),
            is_relative_path=False)
        try:
            output_content = storage.get_file(output_file_url_with_cred)
            val1, val2 = output_content.split()
        except (ValueError, IndexError, IOError) as e:
            logger.warn(e)
            continue
        try:
            x = float(val1)
            y = float(val2)
        except (ValueError, IndexError) as e:
            logger.warn(e)
            continue

        # Use the process id as the MyTardis dataset name
        all_settings['graph_point_id'] = str(i)

        def _get_dataset_name(settings, url, path):
            return all_settings['graph_point_id']

        # Create a new dataset and add it to the experiment.
        # If experiment_id == 0, a new experiment is created.
        experiment_id = mytardis.create_dataset(
            settings=all_settings,  # MyTardis credentials
            source_url=process_output_url_with_cred,
            exp_id=experiment_id,
            dataset_name=_get_dataset_name,  # function that defines the dataset name
            dataset_paramset=[
                # a new blank parameter set conforming to schema 'remotemake/output'
                mytardis.create_paramset("remotemake/output", []),
                mytardis.create_graph_paramset(
                    "dsetgraph",  # name of schema
                    name="randdset",  # a unique dataset name
                    graph_info={},
                    value_dict={"randdset/x": x,
                                "randdset/y": y},  # values used in experiment graphs
                    value_keys=[]),
            ])
    return experiment_id